xref: /xnu-12377.81.4/osfmk/kern/sched_prim.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_FREE_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	sched_prim.c
60  *	Author:	Avadis Tevanian, Jr.
61  *	Date:	1986
62  *
63  *	Scheduling primitives
64  *
65  */
66 
67 #include <debug.h>
68 
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74 
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80 
81 #include <machine/commpage.h>
82 
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/smr.h>
90 #include <kern/debug.h>
91 #include <kern/macro_help.h>
92 #include <kern/machine.h>
93 #include <kern/misc_protos.h>
94 #include <kern/monotonic.h>
95 #include <kern/processor.h>
96 #include <kern/queue.h>
97 #include <kern/recount.h>
98 #include <kern/restartable.h>
99 #include <kern/sched.h>
100 #include <kern/sched_prim.h>
101 #include <kern/sched_rt.h>
102 #include <kern/sfi.h>
103 #include <kern/syscall_subr.h>
104 #include <kern/task.h>
105 #include <kern/thread.h>
106 #include <kern/thread_group.h>
107 #include <kern/ledger.h>
108 #include <kern/timer_queue.h>
109 #include <kern/waitq.h>
110 #include <kern/policy_internal.h>
111 
112 #include <vm/pmap.h>
113 #include <vm/vm_kern.h>
114 #include <vm/vm_map.h>
115 #include <vm/vm_pageout_xnu.h>
116 
117 #include <mach/sdt.h>
118 #include <mach/mach_host.h>
119 #include <mach/host_info.h>
120 
121 #include <sys/kdebug.h>
122 #include <kperf/kperf.h>
123 #include <kern/kpc.h>
124 #include <san/kasan.h>
125 #include <kern/pms.h>
126 #include <kern/host.h>
127 #include <stdatomic.h>
128 #include <os/atomic_private.h>
129 #include <os/log.h>
130 
/* Release-build tracepoint emission shorthand. */
#define KTRC KDBG_RELEASE


/* Per-CPU scheduler statistics; collection gated by sched_stats_active. */
struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;

TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);

TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 50); /* microseconds */
/* Absolute-time form of the above, computed in sched_timeshare_timebase_init. */
static uint64_t nonurgent_preemption_timer_abs = 0;

/* Preemptions per second for standard timeshare threads; "preempt" boot-arg. */
#define         DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

/* Preemptions per second for background threads; "bg_preempt" boot-arg. */
#define         DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

#if XNU_TARGET_OS_XR
#define         MAX_UNSAFE_RT_QUANTA               1
#define         SAFE_RT_MULTIPLIER                 5
#else
#define         MAX_UNSAFE_RT_QUANTA               100
#define         SAFE_RT_MULTIPLIER                 2
#endif /* XNU_TARGET_OS_XR */

#define         MAX_UNSAFE_FIXED_QUANTA               100
#define         SAFE_FIXED_MULTIPLIER                 SAFE_RT_MULTIPLIER

/* Quanta an RT/fixed-pri thread may run before the fail-safe demotes it. */
TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);

TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_FIXED_MULTIPLIER);

#define         MAX_POLL_QUANTA                 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define         SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
int             sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

uint64_t        max_poll_computation;

/*
 * Absolute-time budgets derived from the quanta tunables above; computed
 * in sched_set_max_unsafe_rt_quanta / sched_set_max_unsafe_fixed_quanta.
 */
uint64_t        max_unsafe_rt_computation;
uint64_t        max_unsafe_fixed_computation;
uint64_t        sched_safe_rt_duration;
uint64_t        sched_safe_fixed_duration;
177 
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Quantum lengths in absolute time; set in sched_timeshare_timebase_init. */
uint32_t        std_quantum;
uint32_t        min_std_quantum;
uint32_t        bg_quantum;

/* Quantum lengths in microseconds; set in sched_timeshare_init. */
uint32_t        std_quantum_us;
uint32_t        bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t        thread_depress_time;
uint32_t        default_timeshare_computation;
uint32_t        default_timeshare_constraint;


#if defined(CONFIG_SCHED_TIMESHARE_CORE)

_Atomic uint32_t        sched_tick;
uint32_t                sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t                sched_load_compute_interval_us = 15000;
uint64_t                sched_load_compute_interval_abs;
static _Atomic uint64_t sched_load_compute_deadline;

uint32_t        sched_pri_shifts[TH_BUCKET_MAX];
uint32_t        sched_fixed_shift;

uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling (values in nanoseconds) */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
	                                               * 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
	                                               *   <= 5ms */

/* Absolute-time bins; set in sched_timer_deadline_tracking_init. */
uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);

/* interrupts disabled lock to guard core online, recommendation, pcs state, scheduling policy bits */
decl_simple_lock_data(, sched_available_cores_lock);
231 
/*
 * Locked by sched_available_cores_lock.
 * cluster_powerdown_lock is held while making changes to CPU offline state.
 *
 * NOTE(review): "PCS" below appears to abbreviate the powered-cores-state
 * machinery named by this struct tag — confirm against the rest of the file.
 */
static struct global_powered_cores_state {
	/*
	 * Set when PCS has seen all cores boot up and is ready to manage online
	 * state.  CPU recommendation works before this point.
	 */
	bool    pcs_init_completed;

	cpumap_t pcs_managed_cores;         /* all cores managed by the PCS */

	/*
	 * Inputs for CPU offline state provided by clients
	 */
	cpumap_t pcs_requested_online_user; /* updated by processor_start/exit from userspace */
	cpumap_t pcs_requested_online_clpc_user;
	cpumap_t pcs_requested_online_clpc_system;
	cpumap_t pcs_required_online_pmgr;  /* e.g. ANE needs these powered for their rail to be happy */
	cpumap_t pcs_required_online_system;  /* e.g. smt1 for interrupts, boot processor unless boot arg is set, makes them disable instead of sleep */

	/*
	 * When a suspend count is held, all CPUs must be powered up.
	 */
	int32_t  pcs_powerdown_suspend_count;

	/*
	 * Disable automatic cluster powerdown in favor of explicit user core online control
	 */
	bool     pcs_user_online_core_control;
	bool     pcs_wants_kernel_sleep;
	bool     pcs_in_kernel_sleep;

	/* One snapshot of core power state; used for both effective and goal state. */
	struct powered_cores_state {
		/*
		 * The input into the recommendation computation from update powered cores.
		 */
		cpumap_t pcs_powerdown_recommended_cores;

		/*
		 * These cores are online and are not powered down.
		 *
		 * Processors with processor->processor_online bit set.
		 */
		cpumap_t pcs_online_cores;

		/*
		 * These cores are disabled or powered down
		 * due to temporary reasons and will come back under presented load
		 * so the user should still see them as active in the cpu count.
		 *
		 * Processors with processor->shutdown_temporary bit set.
		 */
		cpumap_t pcs_tempdown_cores;
	} pcs_effective;

	/* The 'goal state' PCS has computed and is attempting to apply */
	struct powered_cores_state pcs_requested;

	/*
	 * Inputs into CPU recommended cores provided by clients.
	 * Note that these may be changed under the available cores lock and
	 * become effective while sched_update_powered_cores_drops_lock is in
	 * the middle of making changes to CPU online state.
	 */

	cpumap_t        pcs_requested_recommended_clpc;
	cpumap_t        pcs_requested_recommended_clpc_system;
	cpumap_t        pcs_requested_recommended_clpc_user;
	bool            pcs_recommended_clpc_failsafe_active;
	bool            pcs_sleep_override_recommended;

	/*
	 * These cores are recommended and can be used for execution
	 * of non-bound threads.
	 *
	 * Processors with processor->is_recommended bit set.
	 */
	cpumap_t pcs_recommended_cores;

	/*
	 * These are for the debugger.
	 * Use volatile to stop the compiler from optimizing out the stores
	 */
	volatile processor_reason_t pcs_in_flight_reason;
	volatile processor_reason_t pcs_previous_reason;
} pcs = {
	/*
	 * Powerdown is suspended during boot until after all CPUs finish booting,
	 * released by sched_cpu_init_completed.
	 */
	.pcs_powerdown_suspend_count = 1,
	.pcs_requested_online_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_system = ALL_CORES_POWERED,
	.pcs_in_flight_reason = REASON_NONE,
	.pcs_previous_reason = REASON_NONE,
	.pcs_requested.pcs_powerdown_recommended_cores = ALL_CORES_POWERED,
	.pcs_requested_recommended_clpc = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_system = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_user = ALL_CORES_RECOMMENDED,
};
335 
/* Exported to sysctl for observability of the recommended-cores mask. */
uint64_t sysctl_sched_recommended_cores = ALL_CORES_RECOMMENDED;

static int sched_last_resort_cpu(void);

static void
sched_update_recommended_cores_locked(
	processor_reason_t reason,
	cpumap_t core_going_offline,
	struct pulled_thread_queue *threadq);

/* Caller must consume the returned queue of threads pulled off cores. */
static __result_use_check struct pulled_thread_queue *
sched_update_powered_cores_drops_lock(
	processor_reason_t requested_reason,
	spl_t s,
	struct pulled_thread_queue *threadq);

#if __arm64__
static void sched_recommended_cores_maintenance(void);
uint64_t    perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm64__ */

/* One second in absolute time; computed in sched_timebase_init. */
uint64_t        sched_one_second_interval;
boolean_t       allow_direct_handoff = TRUE;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t        processor_idle(
	thread_t                        thread,
	processor_t                     processor);

static ast_t
csw_check_locked(
	thread_t        thread,
	processor_t     processor,
	processor_set_t pset,
	ast_t           check_reason);

static void processor_setrun(
	processor_t                    processor,
	thread_t                       thread,
	sched_options_t                options);

static void
sched_timer_deadline_tracking_init(void);

/* Task-filtered debug logging; compiles away entirely on non-DEBUG builds. */
#if     DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t
thread_bind_internal(
	thread_t                thread,
	processor_t             processor);

static void
sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t          sched_load_shifts[NRQS];
bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 * to export to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
int enable_task_set_cluster_type = 0;
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__ && (DEBUG || DEVELOPMENT)
static char
pset_cluster_type_to_name_char(pset_cluster_type_t pset_type);
#endif /* __AMP__ && (DEBUG || DEVELOPMENT) */
442 
/*
 *	sched_init:
 *
 *	Boot-time scheduler initialization.  Resolves tunables from
 *	boot-args and the device tree, initializes the active scheduler
 *	policy, then the boot pset and the boot (master) processor.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	/* Boot-arg overrides the device-tree value, which overrides the default. */
	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	/* Policy init must precede pset/processor init below. */
	SCHED(init)();
	sched_timer_deadline_tracking_init();
	SCHED(pset_init)(sched_boot_pset);
	SCHED(rt_init_pset)(sched_boot_pset);
	SCHED(processor_init)(master_processor);

#if __AMP__
	/* By this point, the hardware topology is known. Create and initialize the other psets. */
	for (int cluster_id = 0; cluster_id < ml_get_cluster_count(); cluster_id++) {
		pset_id_t pset_id = (pset_id_t)cluster_id; /* assume 1-to-1 */
		if (pset_array[pset_id] != PROCESSOR_SET_NULL) {
			/* boot pset was initialized above. */
			assert3u(pset_id, ==, sched_boot_pset->pset_id);
			assert3p(pset_array[pset_id], ==, sched_boot_pset);
			continue;
		}
		assert3u(sched_boot_pset->pset_id, !=, pset_id);
		cluster_type_t cluster_type = ml_get_topology_info()->clusters[pset_id].cluster_type;
		__assert_only processor_set_t pset = pset_create(cluster_type, pset_id, pset_id);
		assert3p(pset, !=, PROCESSOR_SET_NULL);
#if (DEBUG || DEVELOPMENT)
		kprintf("sched_init: created pset %u [%p] with pset_cluster_type=%c\n", pset_id, pset, pset_cluster_type_to_name_char(pset->pset_cluster_type));
#endif /* (DEBUG || DEVELOPMENT) */
	}
#endif /* __AMP__ */

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		/* Boot-arg value 2 additionally forces the system E-core-only. */
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */
}
506 
507 void
sched_timebase_init(void)508 sched_timebase_init(void)
509 {
510 	uint64_t        abstime;
511 
512 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
513 	sched_one_second_interval = abstime;
514 
515 	SCHED(timebase_init)();
516 	sched_realtime_timebase_init();
517 }
518 
519 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
520 
521 void
sched_timeshare_init(void)522 sched_timeshare_init(void)
523 {
524 	/*
525 	 * Calculate the timeslicing quantum
526 	 * in us.
527 	 */
528 	if (default_preemption_rate < 1) {
529 		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
530 	}
531 	std_quantum_us = (1000 * 1000) / default_preemption_rate;
532 
533 	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
534 
535 	if (default_bg_preemption_rate < 1) {
536 		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
537 	}
538 	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
539 
540 	printf("standard background quantum is %d us\n", bg_quantum_us);
541 
542 	load_shift_init();
543 	preempt_pri_init();
544 	os_atomic_store(&sched_tick, 0, relaxed);
545 }
546 
547 void
sched_set_max_unsafe_rt_quanta(int max)548 sched_set_max_unsafe_rt_quanta(int max)
549 {
550 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
551 
552 	max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
553 
554 	const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
555 	sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
556 
557 
558 #if DEVELOPMENT || DEBUG
559 	max_unsafe_rt_quanta = max;
560 #else
561 	/*
562 	 * On RELEASE kernels, this is only called on boot where
563 	 * max is already equal to max_unsafe_rt_quanta.
564 	 */
565 	assert3s(max, ==, max_unsafe_rt_quanta);
566 #endif
567 }
568 
569 void
sched_set_max_unsafe_fixed_quanta(int max)570 sched_set_max_unsafe_fixed_quanta(int max)
571 {
572 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
573 
574 	max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
575 
576 	const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
577 	sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
578 
579 #if DEVELOPMENT || DEBUG
580 	max_unsafe_fixed_quanta = max;
581 #else
582 	/*
583 	 * On RELEASE kernels, this is only called on boot where
584 	 * max is already equal to max_unsafe_fixed_quanta.
585 	 */
586 	assert3s(max, ==, max_unsafe_fixed_quanta);
587 #endif
588 }
589 
590 uint64_t
sched_get_quantum_us(void)591 sched_get_quantum_us(void)
592 {
593 	uint32_t quantum = SCHED(initial_quantum_size)(THREAD_NULL);
594 
595 	uint64_t quantum_ns;
596 	absolutetime_to_nanoseconds(quantum, &quantum_ns);
597 
598 	return quantum_ns / 1000;
599 }
600 
/*
 * Convert the microsecond-denominated timeshare tunables into absolute
 * time now that the timebase is known, and derive the dependent
 * scheduler constants (tick interval, decay shift, fail-safe budgets).
 * Note: `abstime` is reused sequentially; statement order matters.
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t        abstime;
	uint32_t        shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 * (abstime here is still the tick interval from above.)
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* Start each bucket at INT8_MAX until the load computation updates it. */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* Fail-safe budgets depend on the quantum size computed above. */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
665 
666 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
667 
668 void
sched_check_spill(processor_set_t pset,thread_t thread)669 sched_check_spill(processor_set_t pset, thread_t thread)
670 {
671 	(void)pset;
672 	(void)thread;
673 
674 	return;
675 }
676 
677 bool
sched_thread_should_yield(processor_t processor,thread_t thread)678 sched_thread_should_yield(processor_t processor, thread_t thread)
679 {
680 	(void)thread;
681 
682 	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
683 }
684 
685 /* Default implementations of .steal_thread_enabled */
686 bool
sched_steal_thread_DISABLED(processor_set_t pset)687 sched_steal_thread_DISABLED(processor_set_t pset)
688 {
689 	(void)pset;
690 	return false;
691 }
692 
693 bool
sched_steal_thread_enabled(processor_set_t pset)694 sched_steal_thread_enabled(processor_set_t pset)
695 {
696 	return bit_count(pset->node->pset_map) > 1;
697 }
698 
699 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
700 
/*
 * Set up values for timeshare
 * loading factors.
 *
 * Fills sched_load_shifts[load] with the shift "k" such that a thread's
 * per-quantum priority penalty under system load `load` is ~2^k levels.
 */
static void
load_shift_init(void)
{
	int8_t          k, *p = sched_load_shifts;
	uint32_t        i, j;

	uint32_t        sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1 get special-cased entries. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
750 
751 static void
preempt_pri_init(void)752 preempt_pri_init(void)
753 {
754 	bitmap_t *p = sched_preempt_pri;
755 
756 	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
757 		bitmap_set(p, i);
758 	}
759 
760 	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
761 		bitmap_set(p, i);
762 	}
763 }
764 
765 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
766 
767 void
check_monotonic_time(uint64_t ctime)768 check_monotonic_time(uint64_t ctime)
769 {
770 	processor_t processor = current_processor();
771 	uint64_t last_dispatch = processor->last_dispatch;
772 
773 	if (last_dispatch > ctime) {
774 		panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
775 		    last_dispatch, ctime);
776 	}
777 }
778 
779 
/*
 *	Thread wait timer expiration.
 *	Runs in timer interrupt context with interrupts disabled.
 *
 *	p0 is the thread whose wait timed out; p1 is unused.
 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	if (thread->wait_timer_armed) {
		thread->wait_timer_armed = false;
		/* Wake the thread with a timed-out wait result. */
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	/* This timer call is no longer outstanding; drop its active count. */
	thread->wait_timer_active--;

	thread_unlock(thread);
}
805 
/*
 *	thread_unblock:
 *
 *	Unblock thread on wake up.
 *
 *	Returns TRUE if the thread should now be placed on the runqueue.
 *
 *	Thread must be locked.
 *
 *	Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t                thread,
	wait_result_t   wresult)
{
	boolean_t               ready_for_runq = FALSE;
	thread_t                cthread = current_thread();
	uint32_t                new_run_count;
	int                             old_thread_state;

	/*
	 *	Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 *	Cancel pending wait timer.
	 */
	if (thread->wait_timer_armed) {
		if (timer_call_cancel(thread->wait_timer)) {
			/* Cancelled before firing: release its active count. */
			thread->wait_timer_active--;
		}
		thread->wait_timer_armed = false;
	}

	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 *	Update scheduling state: not waiting,
	 *	set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);

	if ((old_thread_state & TH_RUN) == 0) {
		/* Genuine wait -> runnable transition. */
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */

	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		/* Woken directly from interrupt context (not a callout thread). */
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		if (ttd) {
			/* Bucket the wakeup by the timer's time-to-deadline. */
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			/* "Double hop": interrupt -> callout thread -> this thread. */
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		/* Record provenance so a later wake by this thread can propagate it. */
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
971 
972 /*
973  *	Routine:	thread_allowed_for_handoff
974  *	Purpose:
975  *		Check if the thread is allowed for handoff operation
976  *	Conditions:
977  *		thread lock held, IPC locks may be held.
978  *	TODO: In future, do not allow handoff if threads have different cluster
979  *	recommendations.
980  */
981 boolean_t
thread_allowed_for_handoff(thread_t thread)982 thread_allowed_for_handoff(
983 	thread_t         thread)
984 {
985 	thread_t self = current_thread();
986 
987 	if (allow_direct_handoff &&
988 	    thread->sched_mode == TH_MODE_REALTIME &&
989 	    self->sched_mode == TH_MODE_REALTIME) {
990 		return TRUE;
991 	}
992 
993 	return FALSE;
994 }
995 
996 /*
997  *	Routine:	thread_go
998  *	Purpose:
999  *		Unblock and dispatch thread.
1000  *	Conditions:
1001  *		thread lock held, IPC locks may be held.
1002  *		thread must have been waiting
1003  */
void
thread_go(
	thread_t                thread,
	wait_result_t           wresult,
	bool                    try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/* The wait must already have been fully torn down by the waker. */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_is_null(thread->waitq));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);

	/* A thread that has run before must be mid-wakeup (TH_WAKING) here. */
	if (thread->started) {
		assert(thread->state & TH_WAKING);
	}

	thread_lock_assert(thread, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);

	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			/* Stash a reference so self can switch directly to it. */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;

			/*
			 * A TH_RUN'ed thread must have a chosen_processor.
			 * thread_setrun would have set it, so we need to
			 * replicate that here.
			 */
			thread->chosen_processor = current_processor();
		} else {
			/* No handoff: dispatch through the normal runqueue path. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}
}
1051 
1052 /*
1053  *	Routine:	thread_mark_wait_locked
1054  *	Purpose:
1055  *		Mark a thread as waiting.  If, given the circumstances,
1056  *		it doesn't want to wait (i.e. already aborted), then
1057  *		indicate that in the return value.
1058  *	Conditions:
1059  *		at splsched() and thread is locked.
1060  */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t                        thread,
	wait_interrupt_t        interruptible_orig)
{
	boolean_t                       at_safe_point;
	wait_interrupt_t        interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	/* Thread must not already be waiting, waking, or terminating. */
	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 *	The thread may have certain types of interrupts/aborts masked
	 *	off.  Even if the wait location says these types of interrupts
	 *	are OK, we have to honor mask settings (outer-scoped code may
	 *	not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/*
	 * Proceed with the wait unless a pending abort can be delivered
	 * right now given the effective interruptibility.
	 */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			/* An effective interruptibility of 0 means an uninterruptible wait. */
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/* Only report waits that the caller has not asked to suppress. */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Abort is deliverable now; consume a "safely" abort request. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1129 
1130 /*
1131  *	Routine:	thread_interrupt_level
1132  *	Purpose:
1133  *	        Set the maximum interruptible state for the
1134  *		current thread.  The effective value of any
1135  *		interruptible flag passed into assert_wait
1136  *		will never exceed this.
1137  *
1138  *		Useful for code that must not be interrupted,
1139  *		but which calls code that doesn't know that.
1140  *	Returns:
1141  *		The old interrupt level for the thread.
1142  */
1143 __private_extern__
1144 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1145 thread_interrupt_level(
1146 	wait_interrupt_t new_level)
1147 {
1148 	thread_t thread = current_thread();
1149 	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1150 
1151 	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1152 
1153 	return result;
1154 }
1155 
1156 /*
1157  *	assert_wait:
1158  *
1159  *	Assert that the current thread is about to go to
1160  *	sleep until the specified event occurs.
1161  */
1162 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1163 assert_wait(
1164 	event_t                         event,
1165 	wait_interrupt_t        interruptible)
1166 {
1167 	if (__improbable(event == NO_EVENT)) {
1168 		panic("%s() called with NO_EVENT", __func__);
1169 	}
1170 
1171 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1172 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1173 	    VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1174 
1175 	struct waitq *waitq;
1176 	waitq = global_eventq(event);
1177 	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1178 }
1179 
1180 /*
1181  *	assert_wait_queue:
1182  *
1183  *	Return the global waitq for the specified event
1184  */
1185 struct waitq *
assert_wait_queue(event_t event)1186 assert_wait_queue(
1187 	event_t                         event)
1188 {
1189 	return global_eventq(event);
1190 }
1191 
1192 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1193 assert_wait_timeout(
1194 	event_t                         event,
1195 	wait_interrupt_t        interruptible,
1196 	uint32_t                        interval,
1197 	uint32_t                        scale_factor)
1198 {
1199 	thread_t                        thread = current_thread();
1200 	wait_result_t           wresult;
1201 	uint64_t                        deadline;
1202 	spl_t                           s;
1203 
1204 	if (__improbable(event == NO_EVENT)) {
1205 		panic("%s() called with NO_EVENT", __func__);
1206 	}
1207 
1208 	struct waitq *waitq;
1209 	waitq = global_eventq(event);
1210 
1211 	s = splsched();
1212 	waitq_lock(waitq);
1213 
1214 	clock_interval_to_deadline(interval, scale_factor, &deadline);
1215 
1216 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1217 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1218 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1219 
1220 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1221 	    interruptible,
1222 	    TIMEOUT_URGENCY_SYS_NORMAL,
1223 	    deadline, TIMEOUT_NO_LEEWAY,
1224 	    thread);
1225 
1226 	waitq_unlock(waitq);
1227 	splx(s);
1228 	return wresult;
1229 }
1230 
1231 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1232 assert_wait_timeout_with_leeway(
1233 	event_t                         event,
1234 	wait_interrupt_t        interruptible,
1235 	wait_timeout_urgency_t  urgency,
1236 	uint32_t                        interval,
1237 	uint32_t                        leeway,
1238 	uint32_t                        scale_factor)
1239 {
1240 	thread_t                        thread = current_thread();
1241 	wait_result_t           wresult;
1242 	uint64_t                        deadline;
1243 	uint64_t                        abstime;
1244 	uint64_t                        slop;
1245 	uint64_t                        now;
1246 	spl_t                           s;
1247 
1248 	if (__improbable(event == NO_EVENT)) {
1249 		panic("%s() called with NO_EVENT", __func__);
1250 	}
1251 
1252 	now = mach_absolute_time();
1253 	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1254 	deadline = now + abstime;
1255 
1256 	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1257 
1258 	struct waitq *waitq;
1259 	waitq = global_eventq(event);
1260 
1261 	s = splsched();
1262 	waitq_lock(waitq);
1263 
1264 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1265 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1266 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1267 
1268 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1269 	    interruptible,
1270 	    urgency, deadline, slop,
1271 	    thread);
1272 
1273 	waitq_unlock(waitq);
1274 	splx(s);
1275 	return wresult;
1276 }
1277 
1278 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1279 assert_wait_deadline(
1280 	event_t                         event,
1281 	wait_interrupt_t        interruptible,
1282 	uint64_t                        deadline)
1283 {
1284 	thread_t                        thread = current_thread();
1285 	wait_result_t           wresult;
1286 	spl_t                           s;
1287 
1288 	if (__improbable(event == NO_EVENT)) {
1289 		panic("%s() called with NO_EVENT", __func__);
1290 	}
1291 
1292 	struct waitq *waitq;
1293 	waitq = global_eventq(event);
1294 
1295 	s = splsched();
1296 	waitq_lock(waitq);
1297 
1298 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1299 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1300 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1301 
1302 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1303 	    interruptible,
1304 	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1305 	    TIMEOUT_NO_LEEWAY, thread);
1306 	waitq_unlock(waitq);
1307 	splx(s);
1308 	return wresult;
1309 }
1310 
1311 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1312 assert_wait_deadline_with_leeway(
1313 	event_t                         event,
1314 	wait_interrupt_t        interruptible,
1315 	wait_timeout_urgency_t  urgency,
1316 	uint64_t                        deadline,
1317 	uint64_t                        leeway)
1318 {
1319 	thread_t                        thread = current_thread();
1320 	wait_result_t           wresult;
1321 	spl_t                           s;
1322 
1323 	if (__improbable(event == NO_EVENT)) {
1324 		panic("%s() called with NO_EVENT", __func__);
1325 	}
1326 
1327 	struct waitq *waitq;
1328 	waitq = global_eventq(event);
1329 
1330 	s = splsched();
1331 	waitq_lock(waitq);
1332 
1333 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1334 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1335 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1336 
1337 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1338 	    interruptible,
1339 	    urgency, deadline, leeway,
1340 	    thread);
1341 	waitq_unlock(waitq);
1342 	splx(s);
1343 	return wresult;
1344 }
1345 
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	/* Reset the condition word to its initial state (SCHED_COND_INIT). */
	os_atomic_init(cond, SCHED_COND_INIT);
}
1352 
wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation,
	void *parameter)
{
	/* Assert the wait first so a concurrent signal will find us waiting. */
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	/* No wakeup pending: block, resuming in `continuation` if supplied. */
	return thread_block_parameter(continuation, parameter);
}
1372 
wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation)
{
	/* Convenience wrapper: wait with no continuation parameter. */
	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
}
1381 
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	/*
	 * Atomically flip both bits: set ACTIVE (the waiter is running again)
	 * and clear WAKEUP (acknowledge the pending wakeup).  The acquire
	 * ordering pairs with the release in sched_cond_signal().
	 * The assert verifies ACTIVE was clear on entry (the xor set it).
	 */
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1390 
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t  *cond,
	thread_t thread)
{
	disable_preemption();
	/* Release ordering pairs with the acquire in sched_cond_ack(). */
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1405 
1406 /*
1407  * thread_isoncpu:
1408  *
1409  * Return TRUE if a thread is running on a processor such that an AST
1410  * is needed to pull it out of userspace execution, or if executing in
1411  * the kernel, bring to a context switch boundary that would cause
1412  * thread state to be serialized in the thread PCB.
1413  *
1414  * Thread locked, returns the same way. While locked, fields
1415  * like "state" cannot change. "runq" can change only from set to unset.
1416  */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	/* TODO: This can also be incorrect for `handoff` cases where
	 * the thread is never enqueued on the runq */
	if (thread_get_runq(thread) != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}
1451 
1452 /*
1453  * thread_stop:
1454  *
1455  * Force a preemption point for a thread and wait
1456  * for it to stop running on a CPU. If a stronger
1457  * guarantee is requested, wait until no longer
1458  * runnable. Arbitrates access among
1459  * multiple stop requests. (released by unstop)
1460  *
1461  * The thread must enter a wait state and stop via a
1462  * separate means.
1463  *
1464  * Returns FALSE if interrupted.
1465  */
boolean_t
thread_stop(
	thread_t                thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	spl_t                   s = splsched();
	boolean_t               oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Arbitrate with any other stop in progress: only one holder of
	 * TH_SUSP at a time.  Sleep interruptibly on wake_active until the
	 * current holder releases it (thread_unstop wakes wake_active).
	 */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted before the stop was acquired. */
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread->state |= TH_SUSP;

	/*
	 * Wait for the thread to leave the CPU (and, if requested, to stop
	 * being runnable entirely), nudging it with an AST each iteration.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			/*
			 * TODO: chosen_processor isn't really the right
			 * thing to IPI here.  We really want `last_processor`,
			 * but we also want to know where to send the IPI
			 * *before* thread_invoke sets last_processor.
			 *
			 * rdar://47149497 (thread_stop doesn't IPI the right core)
			 */
			assert(thread->state & TH_RUN);
			processor_t processor = thread->chosen_processor;
			assert(processor != PROCESSOR_NULL);
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted: release the stop we just took. */
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1552 
1553 /*
1554  * thread_unstop:
1555  *
1556  * Release a previous stop request and set
1557  * the thread running if appropriate.
1558  *
1559  * Use only after a successful stop operation.
1560  */
void
thread_unstop(
	thread_t        thread)
{
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A bare TH_SUSP (neither runnable nor waiting) is illegal here. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			/*
			 * Someone is sleeping on wake_active (thread_stop /
			 * thread_wait): drop the thread lock, then wake them
			 * while still holding the wake lock.
			 */
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1591 
1592 /*
1593  * thread_wait:
1594  *
1595  * Wait for a thread to stop running. (non-interruptible)
1596  *
1597  */
void
thread_wait(
	thread_t        thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	boolean_t       oncpu;
	processor_t     processor;
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			/* Kick the CPU so the thread reaches a switch boundary. */
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		/* Ask to be woken when the target's run state changes. */
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1646 
1647 /*
1648  *	Routine: clear_wait_internal
1649  *
1650  *		Clear the wait condition for the specified thread.
1651  *		Start the thread executing if that is appropriate.
1652  *	Arguments:
1653  *		thread		thread to awaken
1654  *		result		Wakeup result the thread should see
1655  *	Conditions:
1656  *		At splsched
1657  *		the thread is locked.
1658  *	Returns:
1659  *		KERN_SUCCESS		thread was rousted out a wait
1660  *		KERN_FAILURE		thread was waiting but could not be rousted
1661  *		KERN_NOT_WAITING	thread was not waiting
1662  */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t        thread,
	wait_result_t   wresult)
{
	waitq_t waitq = thread->waitq;

	/* An uninterruptible wait cannot be cleared with "interrupted". */
	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/*
	 * Check that the thread is waiting and not waking, as a waking thread
	 * has already cleared its waitq, and is destined to be go'ed, don't
	 * need to do it again.
	 */
	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
		assert(waitq_is_null(thread->waitq));
		return KERN_NOT_WAITING;
	}

	/* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	thread_go(thread, wresult, /* handoff */ false);

	return KERN_SUCCESS;
}
1693 
1694 
1695 /*
1696  *	clear_wait:
1697  *
1698  *	Clear the wait condition for the specified thread.  Start the thread
1699  *	executing if that is appropriate.
1700  *
1701  *	parameters:
1702  *	  thread		thread to awaken
1703  *	  result		Wakeup result the thread should see
1704  */
1705 __mockable kern_return_t
clear_wait(thread_t thread,wait_result_t result)1706 clear_wait(
1707 	thread_t                thread,
1708 	wait_result_t   result)
1709 {
1710 	kern_return_t ret;
1711 	spl_t           s;
1712 
1713 	s = splsched();
1714 	thread_lock(thread);
1715 
1716 	ret = clear_wait_internal(thread, result);
1717 
1718 	if (thread == current_thread()) {
1719 		/*
1720 		 * The thread must be ready to wait again immediately
1721 		 * after clearing its own wait.
1722 		 */
1723 		assert((thread->state & TH_WAKING) == 0);
1724 	}
1725 
1726 	thread_unlock(thread);
1727 	splx(s);
1728 	return ret;
1729 }
1730 
/*
 *	thread_wakeup_nthreads_prim:
 *
 *	Wake up to `nthreads` waiters on an event.  Common implementation
 *	underlying thread_wakeup, thread_wakeup_with_result,
 *	and thread_wakeup_one.
 *
 */
1738 kern_return_t
thread_wakeup_nthreads_prim(event_t event,uint32_t nthreads,wait_result_t result)1739 thread_wakeup_nthreads_prim(
1740 	event_t          event,
1741 	uint32_t         nthreads,
1742 	wait_result_t    result)
1743 {
1744 	if (__improbable(event == NO_EVENT)) {
1745 		panic("%s() called with NO_EVENT", __func__);
1746 	}
1747 
1748 	struct waitq *wq = global_eventq(event);
1749 	uint32_t count;
1750 
1751 	count = waitq_wakeup64_nthreads(wq, CAST_EVENT64_T(event), result,
1752 	    WAITQ_WAKEUP_DEFAULT, nthreads);
1753 	return count ? KERN_SUCCESS : KERN_NOT_WAITING;
1754 }
1755 
1756 /*
1757  *	thread_wakeup_prim:
1758  *
1759  *	Common routine for thread_wakeup, thread_wakeup_with_result,
1760  *	and thread_wakeup_one.
1761  *
1762  */
1763 __mockable kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1764 thread_wakeup_prim(
1765 	event_t          event,
1766 	boolean_t        one_thread,
1767 	wait_result_t    result)
1768 {
1769 	if (one_thread) {
1770 		return thread_wakeup_nthreads_prim(event, 1, result);
1771 	} else {
1772 		return thread_wakeup_nthreads_prim(event, UINT32_MAX, result);
1773 	}
1774 }
1775 
1776 /*
1777  * Wakeup a specified thread if and only if it's waiting for this event
1778  */
1779 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1780 thread_wakeup_thread(
1781 	event_t         event,
1782 	thread_t        thread)
1783 {
1784 	if (__improbable(event == NO_EVENT)) {
1785 		panic("%s() called with NO_EVENT", __func__);
1786 	}
1787 
1788 	if (__improbable(thread == THREAD_NULL)) {
1789 		panic("%s() called with THREAD_NULL", __func__);
1790 	}
1791 
1792 	struct waitq *wq = global_eventq(event);
1793 
1794 	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1795 }
1796 
1797 /*
1798  *	thread_bind:
1799  *
1800  *	Force the current thread to execute on the specified processor.
1801  *	Takes effect after the next thread_block().
1802  *
1803  *	Returns the previous binding.  PROCESSOR_NULL means
1804  *	not bound.
1805  *
1806  *	XXX - DO NOT export this to users - XXX
1807  */
1808 processor_t
thread_bind(processor_t processor)1809 thread_bind(
1810 	processor_t             processor)
1811 {
1812 	thread_t                self = current_thread();
1813 	processor_t             prev;
1814 	spl_t                   s;
1815 
1816 	s = splsched();
1817 	thread_lock(self);
1818 
1819 	prev = thread_bind_internal(self, processor);
1820 
1821 	thread_unlock(self);
1822 	splx(s);
1823 
1824 	return prev;
1825 }
1826 
void
thread_bind_during_wakeup(thread_t thread, processor_t processor)
{
	assert(!ml_get_interrupts_enabled());
	/* Only legal on a thread that is waiting and already marked waking. */
	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
#if MACH_ASSERT
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
#endif

	/* Skip the rebind if the thread is already bound to this processor. */
	if (thread->bound_processor != processor) {
		thread_bind_internal(thread, processor);
	}
}
1840 
void
thread_unbind_after_queue_shutdown(
	thread_t                thread,
	processor_t             processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		assert(thread->bound_processor == processor);

		/* Pull the thread off its runqueue, if it is queued. */
		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);
		if (removed) {
			/* Re-enqueue so the scheduler re-places the now-unbound thread. */
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}
1868 
1869 /*
1870  * thread_bind_internal:
1871  *
1872  * If the specified thread is not the current thread, and it is currently
1873  * running on another CPU, a remote AST must be sent to that CPU to cause
1874  * the thread to migrate to its bound processor. Otherwise, the migration
1875  * will occur at the next quantum expiration or blocking point.
1876  *
1877  * When the thread is the current thread, and explicit thread_block() should
1878  * be used to force the current processor to context switch away and
1879  * let the thread migrate to the bound processor.
1880  *
1881  * Thread must be locked, and at splsched.
1882  */
1883 
1884 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)1885 thread_bind_internal(
1886 	thread_t                thread,
1887 	processor_t             processor)
1888 {
1889 	processor_t             prev;
1890 
1891 	/* <rdar://problem/15102234> */
1892 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
1893 	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1894 	thread_assert_runq_null(thread);
1895 
1896 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
1897 	    thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
1898 
1899 	prev = thread->bound_processor;
1900 	thread->bound_processor = processor;
1901 
1902 	return prev;
1903 }
1904 
1905 /*
1906  * thread_vm_bind_group_add:
1907  *
1908  * The "VM bind group" is a special mechanism to mark a collection
1909  * of threads from the VM subsystem that, in general, should be scheduled
1910  * with only one CPU of parallelism. To accomplish this, we initially
1911  * bind all the threads to the master processor, which has the effect
1912  * that only one of the threads in the group can execute at once, including
1913  * preempting threads in the group that are a lower priority. Future
1914  * mechanisms may use more dynamic mechanisms to prevent the collection
1915  * of VM threads from using more CPU time than desired.
1916  *
1917  * The current implementation can result in priority inversions where
1918  * compute-bound priority 95 or realtime threads that happen to have
1919  * landed on the master processor prevent the VM threads from running.
1920  * When this situation is detected, we unbind the threads for one
1921  * scheduler tick to allow the scheduler to run the threads an
1922  * additional CPUs, before restoring the binding (assuming high latency
1923  * is no longer a problem).
1924  */
1925 
1926 /*
1927  * The current max is provisioned for:
1928  * vm_compressor_swap_trigger_thread (92)
1929  * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1930  * vm_pageout_continue (92)
1931  * memorystatus_thread (95)
1932  */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the VM bind group list and count below. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Threads currently enrolled in the VM bind group (see thread_vm_bind_group_add). */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while the group's binding has been dropped to recover from high latency. */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1938 
/*
 * Enroll the current thread in the VM bind group (see the block comment
 * above): record it in the group list, bind it to the master processor,
 * and context switch so it migrates there.
 */
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	if (support_bootcpu_shutdown) {
		/*
		 * Bind group is not supported without an always-on
		 * processor to bind to. If we need these to coexist,
		 * we'd need to dynamically move the group to
		 * another processor as it shuts down, or build
		 * a different way to run a set of threads
		 * without parallelism.
		 */
		return;
	}

	/* Take a reference so the list's pointer stays valid for the thread's lifetime. */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1969 
/*
 * sched_vm_group_maintenance:
 *
 * Periodic check of the VM bind group.  Detects the priority-inversion
 * scenario described above thread_vm_bind_group_add() and toggles the
 * group's binding between the master processor and unbound accordingly.
 */
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/*
	 * Pass 1: observe the group.  A runnable thread sitting on a runqueue
	 * for more than a sched tick indicates high latency; a runnable thread
	 * on no runqueue indicates some thread is running (or transitioning).
	 */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new bind target to every thread in the group. */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2067 
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* 1: prefer cpu0 as backup; 2: prefer SMT secondaries as backup (see thread_select). */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2077 
/* Return the current number of backup processors used for realtime scheduling. */
int
sched_get_rt_n_backup_processors(void)
{
	return sched_rt_n_backup_processors;
}
2083 
2084 void
sched_set_rt_n_backup_processors(int n)2085 sched_set_rt_n_backup_processors(int n)
2086 {
2087 	if (n < 0) {
2088 		n = 0;
2089 	} else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2090 		n = SCHED_MAX_BACKUP_PROCESSORS;
2091 	}
2092 
2093 	sched_rt_n_backup_processors = n;
2094 }
2095 
2096 /*
2097  * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2098  * rebalancing opportunity exists when a core is (instantaneously) idle, but
2099  * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2100  * IPI thrash if this core does not remain idle following the load balancing ASTs
2101  * Idle "thrash", when IPI issue is followed by idle entry/core power down
2102  * followed by a wakeup shortly thereafter.
2103  */
2104 
#if (DEVELOPMENT || DEBUG)
/* Kill switch for SMT rebalancing: sched_SMT_balance() bails out when zero. */
int sched_smt_balance = 1;
#endif
2108 
2109 #if CONFIG_SCHED_SMT
/* Invoked with pset locked, returns with pset unlocked */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	/* Development-only kill switch. */
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Candidates: running secondary CPUs in this pset. */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Signal only if its primary sibling is busy too and it's not running realtime work. */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* Issue the rebalance IPI only after the pset lock has been dropped. */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2163 #else /* CONFIG_SCHED_SMT */
/* Invoked with pset locked, returns with pset unlocked */
bool
sched_SMT_balance(__unused processor_t cprocessor, __unused processor_set_t cpset)
{
	/* No SMT support in this configuration: nothing to rebalance, just drop the lock. */
	pset_unlock(cpset);
	return false;
}
2171 #endif /* CONFIG_SCHED_SMT */
2172 
2173 int
pset_available_cpu_count(processor_set_t pset)2174 pset_available_cpu_count(processor_set_t pset)
2175 {
2176 	return bit_count(pset_available_cpumap(pset));
2177 }
2178 
2179 bool
pset_is_recommended(processor_set_t pset)2180 pset_is_recommended(processor_set_t pset)
2181 {
2182 	if (!pset) {
2183 		return false;
2184 	}
2185 	return pset_available_cpu_count(pset) > 0;
2186 }
2187 
2188 bool
pset_type_is_recommended(processor_set_t pset)2189 pset_type_is_recommended(processor_set_t pset)
2190 {
2191 	if (!pset) {
2192 		return false;
2193 	}
2194 	pset_map_t recommended_psets = os_atomic_load(&pset->node->pset_recommended_map, relaxed);
2195 	return bit_count(recommended_psets) > 0;
2196 }
2197 
2198 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2199 pset_available_but_not_running_cpumap(processor_set_t pset)
2200 {
2201 	return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2202 	       pset->recommended_bitmask;
2203 }
2204 
2205 bool
pset_has_stealable_threads(processor_set_t pset)2206 pset_has_stealable_threads(processor_set_t pset)
2207 {
2208 	pset_assert_locked(pset);
2209 
2210 	cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2211 #if CONFIG_SCHED_SMT
2212 	/*
2213 	 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2214 	 * available primary CPUs
2215 	 */
2216 	avail_map &= pset->primary_map;
2217 #endif /* CONFIG_SCHED_SMT */
2218 
2219 	return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2220 }
2221 
/*
 * clear_pending_AST_bits:
 *
 * Acknowledge any pending urgent/preempt (and deferred, if configured) AST
 * IPIs targeted at this processor.  Must be called with the pset lock held;
 * trace_point_number distinguishes the call site in the trace stream.
 */
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2237 
2238 /*
2239  * Called with pset locked, on a processor that is committing to run a new thread
2240  * Will transition an idle or dispatching processor to running as it picks up
2241  * the first new thread from the idle thread.
2242  */
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
	pset_assert_locked(pset);

	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
		assert(current_thread() == processor->idle_thread);

		/*
		 * Dispatching processor is now committed to running new_thread,
		 * so change its state to PROCESSOR_RUNNING.
		 */
		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	} else {
		assert(processor->state == PROCESSOR_RUNNING);
	}

	processor_state_update_from_thread(processor, new_thread, true);

	/* Track which CPUs of this pset are now running realtime threads. */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		bit_set(pset->realtime_map, processor->cpu_id);
	} else {
		bit_clear(pset->realtime_map, processor->cpu_id);
	}
	pset_update_rt_stealable_state(pset);

	pset_node_t node = pset->node;

	if (bit_count(node->pset_map) == 1) {
		/* Node has only a single pset, so skip node pset map updates */
		return;
	}

	cpumap_t avail_map = pset_available_cpumap(pset);

	/*
	 * Keep the node-level maps of "psets that still have a non-RT
	 * (primary) CPU available" consistent with this pset's realtime_map.
	 */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
		}
#if CONFIG_SCHED_SMT
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT primary CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	} else {
		if ((avail_map & pset->realtime_map) != avail_map) {
			/* Test before set — presumably to avoid a redundant atomic RMW. */
			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
			}
		}
#if CONFIG_SCHED_SMT
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
			}
		}
#endif /* CONFIG_SCHED_SMT */
	}
}
2306 
2307 #if CONFIG_SCHED_SMT
2308 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2309 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2310 #else /* !CONFIG_SCHED_SMT */
2311 processor_t pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills);
2312 #endif /* !CONFIG_SCHED_SMT */
2313 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2314 
2315 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2316 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2317 {
2318 	pset_map_t pset_map = stealing_pset->node->pset_map;
2319 
2320 	bit_clear(pset_map, stealing_pset->pset_id);
2321 
2322 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2323 		processor_set_t nset = pset_array[pset_id];
2324 
2325 		if (rt_deadline_add(os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed), rt_deadline_epsilon) < earliest_deadline) {
2326 			return true;
2327 		}
2328 	}
2329 
2330 	return false;
2331 }
2332 
2333 /*
2334  * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2335  * followup processor - used in thread_select when there are still threads on the run queue and available processors
2336  * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2337  */
typedef enum {
	none,           /* no additional processor needs signalling */
	backup,         /* backup IPI target, chosen by choose_processor */
	followup,       /* followup IPI target, chosen by thread_select */
	spill           /* cross-pset spill IPI target, to steal from this runqueue */
} next_processor_type_t;
2344 
/* Debug scaffolding: define LOOP_COUNT to record per-CPU high-water marks of thread_select() restarts. */
#undef LOOP_COUNT
#ifdef LOOP_COUNT
int max_loop_count[MAX_SCHED_CPUS] = { 0 };
#endif
2349 
2350 /*
2351  *	thread_select:
2352  *
2353  *	Select a new thread for the current processor to execute.
2354  *
2355  *	May select the current thread, which must be locked.
2356  */
2357 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2358 thread_select(thread_t          thread,
2359     processor_t       processor,
2360     ast_t            *reason)
2361 {
2362 	processor_set_t         pset = processor->processor_set;
2363 	thread_t                        new_thread = THREAD_NULL;
2364 
2365 	assert(processor == current_processor());
2366 	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2367 
2368 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2369 	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2370 
2371 	__kdebug_only int idle_reason = 0;
2372 	__kdebug_only int delay_count = 0;
2373 
2374 #if CONFIG_SCHED_SMT
2375 	int timeout_count = sched_backup_cpu_timeout_count;
2376 	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2377 		/* Prefer cpu0 as backup */
2378 		timeout_count--;
2379 	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2380 		/* Prefer secondary cpu as backup */
2381 		timeout_count--;
2382 	}
2383 #endif /* CONFIG_SCHED_SMT */
2384 	bool pending_AST_URGENT = false;
2385 	bool pending_AST_PREEMPT = false;
2386 
2387 #ifdef LOOP_COUNT
2388 	int loop_count = -1;
2389 #endif
2390 
2391 	do {
2392 		/*
2393 		 *	Update the priority.
2394 		 */
2395 		if (SCHED(can_update_priority)(thread)) {
2396 			SCHED(update_priority)(thread);
2397 		}
2398 
2399 		pset_lock(pset);
2400 
2401 restart:
2402 #ifdef LOOP_COUNT
2403 		loop_count++;
2404 		if (loop_count > max_loop_count[processor->cpu_id]) {
2405 			max_loop_count[processor->cpu_id] = loop_count;
2406 			if (bit_count(loop_count) == 1) {
2407 				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2408 			}
2409 		}
2410 #endif
2411 		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2412 		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2413 
2414 		processor_state_update_from_thread(processor, thread, true);
2415 
2416 		idle_reason = 0;
2417 
2418 		processor_t ast_processor = PROCESSOR_NULL;
2419 		processor_t next_rt_processor = PROCESSOR_NULL;
2420 		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2421 		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2422 
2423 		assert(processor->state != PROCESSOR_OFF_LINE);
2424 
2425 		/*
2426 		 * Bound threads are dispatched to a processor without going through
2427 		 * choose_processor(), so in those cases we must continue trying to dequeue work
2428 		 * as we are the only option.
2429 		 */
2430 		if (!SCHED(processor_bound_count)(processor)) {
2431 			if (!processor->is_recommended) {
2432 				/*
2433 				 * The performance controller has provided a hint to not dispatch more threads,
2434 				 */
2435 				idle_reason = 1;
2436 				goto send_followup_ipi_before_idle;
2437 			} else if (rt_runq_count(pset)) {
2438 				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2439 				/* Give the current RT thread a chance to complete */
2440 				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2441 #if CONFIG_SCHED_SMT
2442 				/*
2443 				 * On Intel we want to avoid SMT secondary processors and processor 0
2444 				 * but allow them to be used as backup processors in case the preferred chosen
2445 				 * processor is delayed by interrupts or processor stalls.  So if it is
2446 				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2447 				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2448 				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2449 				 * to grab the thread before the (current) backup processor does.
2450 				 *
2451 				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2452 				 * on DEVELOPMENT || DEBUG kernels.  It is also adjusted (see above) depending on whether we want to use
2453 				 * cpu0 before secondary cpus or not.
2454 				 */
2455 				if (!ok_to_run_realtime_thread) {
2456 					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2457 						if (timeout_count-- > 0) {
2458 							pset_unlock(pset);
2459 							thread_unlock(thread);
2460 							delay(10);
2461 							delay_count++;
2462 							thread_lock(thread);
2463 							pset_lock(pset);
2464 							goto restart;
2465 						}
2466 						ok_to_run_realtime_thread = true;
2467 					}
2468 				}
2469 #endif /* CONFIG_SCHED_SMT */
2470 				if (!ok_to_run_realtime_thread) {
2471 					idle_reason = 2;
2472 					goto send_followup_ipi_before_idle;
2473 				}
2474 			}
2475 #if CONFIG_SCHED_SMT
2476 			else if (processor->processor_primary != processor) {
2477 				/*
2478 				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2479 				 * we should look for work only under the same conditions that choose_processor()
2480 				 * would have assigned work, which is when all primary processors have been assigned work.
2481 				 */
2482 				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2483 					/* There are idle primaries */
2484 					idle_reason = 3;
2485 					goto idle;
2486 				}
2487 			}
2488 #endif /* CONFIG_SCHED_SMT */
2489 		}
2490 
2491 		/*
2492 		 *	Test to see if the current thread should continue
2493 		 *	to run on this processor.  Must not be attempting to wait, and not
2494 		 *	bound to a different processor, nor be in the wrong
2495 		 *	processor set, nor be forced to context switch by TH_SUSP.
2496 		 *
2497 		 *	Note that there are never any RT threads in the regular runqueue.
2498 		 *
2499 		 *	This code is very insanely tricky.
2500 		 */
2501 
2502 		/* i.e. not waiting, not TH_SUSP'ed */
2503 		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2504 
2505 		/*
2506 		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2507 		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2508 		 *       <rdar://problem/47907700>
2509 		 *
2510 		 * A yielding thread shouldn't be forced to context switch.
2511 		 */
2512 
2513 		bool is_yielding         = (*reason & AST_YIELD) == AST_YIELD;
2514 
2515 #if CONFIG_SCHED_SMT
2516 		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2517 #endif /* CONFIG_SCHED_SMT */
2518 
2519 		bool affinity_mismatch   = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2520 
2521 		bool bound_elsewhere     = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2522 
2523 		bool avoid_processor     = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2524 
2525 		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2526 
2527 		bool current_thread_can_keep_running = (
2528 			still_running
2529 #if CONFIG_SCHED_SMT
2530 			&& !needs_smt_rebalance
2531 #endif /* CONFIG_SCHED_SMT */
2532 			&& !affinity_mismatch
2533 			&& !bound_elsewhere
2534 			&& !avoid_processor);
2535 		if (current_thread_can_keep_running) {
2536 			/*
2537 			 * This thread is eligible to keep running on this processor.
2538 			 *
2539 			 * RT threads with un-expired quantum stay on processor,
2540 			 * unless there's a valid RT thread with an earlier deadline
2541 			 * and it is still ok_to_run_realtime_thread.
2542 			 */
2543 			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2544 				/*
2545 				 * Pick a new RT thread only if ok_to_run_realtime_thread
2546 				 * (but the current thread is allowed to complete).
2547 				 */
2548 				if (ok_to_run_realtime_thread) {
2549 					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2550 						goto pick_new_rt_thread;
2551 					}
2552 					if (rt_runq_priority(pset) > thread->sched_pri) {
2553 						if (sched_rt_runq_strict_priority) {
2554 							/* The next RT thread is better, so pick it off the runqueue. */
2555 							goto pick_new_rt_thread;
2556 						}
2557 
2558 						/*
2559 						 * See if the current lower priority thread can continue to run without causing
2560 						 * the higher priority thread on the runq queue to miss its deadline.
2561 						 */
2562 						thread_t hi_thread = rt_runq_first(&pset->rt_runq);
2563 						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2564 							/* The next RT thread is better, so pick it off the runqueue. */
2565 							goto pick_new_rt_thread;
2566 						}
2567 					} else if ((rt_runq_count(pset) > 0) && (rt_deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2568 						/* The next RT thread is better, so pick it off the runqueue. */
2569 						goto pick_new_rt_thread;
2570 					}
2571 					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2572 						goto pick_new_rt_thread;
2573 					}
2574 				}
2575 
2576 				/* This is still the best RT thread to run. */
2577 				processor->deadline = thread->realtime.deadline;
2578 
2579 				SCHED(update_pset_load_average)(pset, 0);
2580 
2581 				clear_pending_AST_bits(pset, processor, 1);
2582 
2583 				next_rt_processor = PROCESSOR_NULL;
2584 				next_rt_ipi_type = SCHED_IPI_NONE;
2585 
2586 				bool pset_unlocked = false;
2587 				next_processor_type_t nptype = none;
2588 #if CONFIG_SCHED_EDGE
2589 				if (rt_pset_has_stealable_threads(pset)) {
2590 					nptype = spill;
2591 					pset_unlocked = rt_choose_next_processor_for_spill_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type);
2592 				}
2593 #endif /* CONFIG_SCHED_EDGE */
2594 				if (nptype == none && rt_pset_needs_a_followup_IPI(pset)) {
2595 					nptype = followup;
2596 					rt_choose_next_processor_for_followup_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type);
2597 				}
2598 				if (!pset_unlocked) {
2599 					pset_unlock(pset);
2600 				}
2601 
2602 				if (next_rt_processor) {
2603 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2604 					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2605 					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2606 				}
2607 
2608 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2609 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2610 				return thread;
2611 			}
2612 
2613 			if ((rt_runq_count(pset) == 0) &&
2614 			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2615 				/* This thread is still the highest priority runnable (non-idle) thread */
2616 				processor->deadline = RT_DEADLINE_NONE;
2617 
2618 				SCHED(update_pset_load_average)(pset, 0);
2619 
2620 				clear_pending_AST_bits(pset, processor, 2);
2621 
2622 				pset_unlock(pset);
2623 
2624 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2625 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2626 				return thread;
2627 			}
2628 		} else {
2629 			/*
2630 			 * This processor must context switch.
2631 			 * If it's due to a rebalance, we should aggressively find this thread a new home.
2632 			 */
2633 			bool ast_rebalance = affinity_mismatch || bound_elsewhere || avoid_processor;
2634 #if CONFIG_SCHED_SMT
2635 			ast_rebalance = ast_rebalance || needs_smt_rebalance;
2636 #endif /* CONFIG_SCHED_SMT */
2637 			if (ast_rebalance) {
2638 				*reason |= AST_REBALANCE;
2639 			}
2640 		}
2641 
2642 #if CONFIG_SCHED_SMT
2643 		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2644 		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2645 		    (processor->processor_secondary->state == PROCESSOR_IDLE));
2646 #endif /* CONFIG_SCHED_SMT */
2647 
2648 		/* OK, so we're not going to run the current thread. Look at the RT queue. */
2649 		if (ok_to_run_realtime_thread) {
2650 pick_new_rt_thread:
2651 			/* sched_rt_choose_thread may drop and re-take the processor's pset lock. */
2652 			new_thread = sched_rt_choose_thread(processor);
2653 			pset_assert_locked(pset);
2654 			if (new_thread != THREAD_NULL) {
2655 				processor->deadline = new_thread->realtime.deadline;
2656 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2657 
2658 				clear_pending_AST_bits(pset, processor, 3);
2659 
2660 #if CONFIG_SCHED_SMT
2661 				if (processor->processor_secondary != NULL) {
2662 					processor_t sprocessor = processor->processor_secondary;
2663 					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2664 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2665 						ast_processor = sprocessor;
2666 					}
2667 				}
2668 #endif /* CONFIG_SCHED_SMT */
2669 			}
2670 		}
2671 
2672 send_followup_ipi_before_idle:
2673 		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2674 		rt_clear_pending_spill(processor, 5);
2675 		next_processor_type_t nptype = none;
2676 		bool pset_unlocked = false;
2677 #if CONFIG_SCHED_EDGE
2678 		if (rt_pset_has_stealable_threads(pset)) {
2679 			nptype = spill;
2680 			pset_unlocked = rt_choose_next_processor_for_spill_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type);
2681 		}
2682 #endif /* CONFIG_SCHED_EDGE */
2683 		if (nptype == none && rt_pset_needs_a_followup_IPI(pset)) {
2684 			nptype = followup;
2685 			rt_choose_next_processor_for_followup_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type);
2686 		}
2687 
2688 		assert(new_thread || !ast_processor);
2689 		if (new_thread || next_rt_processor) {
2690 			if (!pset_unlocked) {
2691 				pset_unlock(pset);
2692 				pset_unlocked = true;
2693 			}
2694 			if (ast_processor == next_rt_processor) {
2695 				ast_processor = PROCESSOR_NULL;
2696 				ipi_type = SCHED_IPI_NONE;
2697 			}
2698 
2699 			if (ast_processor) {
2700 				sched_ipi_perform(ast_processor, ipi_type);
2701 			}
2702 
2703 			if (next_rt_processor) {
2704 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2705 				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2706 				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2707 			}
2708 
2709 			if (new_thread) {
2710 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2711 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2712 				return new_thread;
2713 			}
2714 		}
2715 
2716 		if (pset_unlocked) {
2717 			pset_lock(pset);
2718 		}
2719 
2720 		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2721 			/* Things changed while we dropped the lock */
2722 			goto restart;
2723 		}
2724 
2725 		if (processor->is_recommended) {
2726 			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2727 			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2728 				/* Things changed while we dropped the lock */
2729 				goto restart;
2730 			}
2731 
2732 #if CONFIG_SCHED_SMT
2733 			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2734 				/* secondary can only run realtime thread */
2735 				if (idle_reason == 0) {
2736 					idle_reason = 4;
2737 				}
2738 				goto idle;
2739 			}
2740 #endif /* CONFIG_SCHED_SMT */
2741 		} else if (!SCHED(processor_bound_count)(processor)) {
2742 			/* processor not recommended and no bound threads */
2743 			if (idle_reason == 0) {
2744 				idle_reason = 5;
2745 			}
2746 			goto idle;
2747 		}
2748 
2749 		processor->deadline = RT_DEADLINE_NONE;
2750 
2751 		/* No RT threads, so let's look at the regular threads. */
2752 		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, current_thread_can_keep_running ? thread : THREAD_NULL, *reason)) != THREAD_NULL) {
2753 			if (new_thread != thread) {
2754 				/* Going to context-switch */
2755 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2756 
2757 				clear_pending_AST_bits(pset, processor, 4);
2758 
2759 				ast_processor = PROCESSOR_NULL;
2760 				ipi_type = SCHED_IPI_NONE;
2761 
2762 #if CONFIG_SCHED_SMT
2763 				processor_t sprocessor = processor->processor_secondary;
2764 				if (sprocessor != NULL) {
2765 					if (sprocessor->state == PROCESSOR_RUNNING) {
2766 						if (thread_no_smt(new_thread)) {
2767 							ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2768 							ast_processor = sprocessor;
2769 						}
2770 					} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2771 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2772 						ast_processor = sprocessor;
2773 					}
2774 				}
2775 #endif /* CONFIG_SCHED_SMT */
2776 
2777 				pset_unlock(pset);
2778 
2779 				if (ast_processor) {
2780 					sched_ipi_perform(ast_processor, ipi_type);
2781 				}
2782 			} else {
2783 				/* Will continue running the current thread */
2784 				clear_pending_AST_bits(pset, processor, 4);
2785 				pset_unlock(pset);
2786 			}
2787 
2788 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2789 			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2790 			return new_thread;
2791 		}
2792 
2793 		if (processor->must_idle) {
2794 			processor->must_idle = false;
2795 			*reason |= AST_REBALANCE;
2796 			idle_reason = 6;
2797 			goto idle;
2798 		}
2799 
2800 		if (SCHED(steal_thread_enabled)(pset)
2801 #if CONFIG_SCHED_SMT
2802 		    && (processor->processor_primary == processor)
2803 #endif /* CONFIG_SCHED_SMT */
2804 		    ) {
2805 			/*
2806 			 * No runnable threads, attempt to steal
2807 			 * from other processors. Returns with pset lock dropped.
2808 			 */
2809 
2810 			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2811 				pset_lock(pset);
2812 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2813 				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2814 					/*
2815 					 * A realtime thread choose this processor while it was DISPATCHING
2816 					 * and the pset lock was dropped
2817 					 */
2818 					ast_on(AST_URGENT | AST_PREEMPT);
2819 				}
2820 
2821 				clear_pending_AST_bits(pset, processor, 5);
2822 
2823 				pset_unlock(pset);
2824 
2825 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2826 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2827 				return new_thread;
2828 			}
2829 
2830 			/*
2831 			 * If other threads have appeared, shortcut
2832 			 * around again.
2833 			 */
2834 			if (SCHED(processor_bound_count)(processor)) {
2835 				continue;
2836 			}
2837 			if (processor->is_recommended) {
2838 				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2839 					continue;
2840 				}
2841 			}
2842 
2843 			pset_lock(pset);
2844 		}
2845 
2846 idle:
2847 		/* Someone selected this processor while we had dropped the lock */
2848 		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2849 		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2850 			goto restart;
2851 		}
2852 
2853 		if ((idle_reason == 0) && current_thread_can_keep_running) {
2854 			/* This thread is the only runnable (non-idle) thread */
2855 			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2856 				processor->deadline = thread->realtime.deadline;
2857 			} else {
2858 				processor->deadline = RT_DEADLINE_NONE;
2859 			}
2860 
2861 			SCHED(update_pset_load_average)(pset, 0);
2862 
2863 			clear_pending_AST_bits(pset, processor, 6);
2864 
2865 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2866 			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2867 			pset_unlock(pset);
2868 			return thread;
2869 		}
2870 
2871 		/*
2872 		 *	Nothing is runnable, or this processor must be forced idle,
2873 		 *	so set this processor idle if it was running.
2874 		 */
2875 		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2876 			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2877 			processor_state_update_idle(processor);
2878 		}
2879 		pset_update_rt_stealable_state(pset);
2880 
2881 		clear_pending_AST_bits(pset, processor, 7);
2882 
2883 		/* Invoked with pset locked, returns with pset unlocked */
2884 		processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2885 
2886 		new_thread = processor->idle_thread;
2887 	} while (new_thread == THREAD_NULL);
2888 
2889 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2890 	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2891 	return new_thread;
2892 }
2893 
2894 /*
2895  * thread_invoke
2896  *
2897  * Called at splsched with neither thread locked.
2898  *
2899  * Perform a context switch and start executing the new thread.
2900  *
2901  * Returns FALSE when the context switch didn't happen.
2902  * The reference to the new thread is still consumed.
2903  *
2904  * "self" is what is currently running on the processor,
2905  * "thread" is the new thread to context switch to
2906  * (which may be the same thread in some cases)
2907  */
static boolean_t
thread_invoke(
	thread_t                        self,
	thread_t                        thread,
	ast_t                           reason)
{
	/*
	 * A nonzero preemption level here means the caller blocked while holding
	 * a spinlock or from interrupt context; a negative level means an
	 * unbalanced unlock.  Either way, die loudly with a hint.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Capture the outgoing thread's parked continuation (if any) up front. */
	thread_continue_t       continuation = self->continuation;
	void                    *parameter   = self->parameter;

	/* Timestamp for the switch; 'ctime' drives all of the accounting below. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip periodic timeshare maintenance when switching to the idle
	 * thread, or when a realtime thread is doing a direct handoff
	 * (keeps the realtime handoff path short).
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime, true);
	}
#endif

	recount_log_switch_thread(&snap);

	processor_t processor = current_processor();

	if (!processor->processor_online) {
		panic("Invalid attempt to context switch an offline processor");
	}

	assert_thread_magic(self);
	assert(self == current_thread());
	thread_assert_runq_null(self);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	/* Only the incoming thread is locked; 'self' stays unlocked (see contract above). */
	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor);
	thread_assert_runq_null(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	if (continuation != NULL) {
		/* 'self' is parking in a continuation: its register state need not be saved. */
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for the incoming thread. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required.  However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */


			recount_log_switch_thread_on(&snap);

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			/* Resume the incoming thread at its continuation; does not return. */
			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			if (!stack_alloc_try(thread)) {
				/* No stack available: hand the thread to the stack allocator
				 * and report that the switch did not happen. */
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with saved register state: nothing to switch. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required.  We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	/* The continuation handed to the platform layer must match 'self's record. */
	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */


	/* Previous snap on the old stack is gone. */
	recount_log_switch_thread_on(NULL);

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3240 
3241 #if defined(CONFIG_SCHED_DEFERRED_AST)
3242 /*
3243  *	pset_cancel_deferred_dispatch:
3244  *
3245  *	Cancels all ASTs that we can cancel for the given processor set
3246  *	if the current processor is running the last runnable thread in the
3247  *	system.
3248  *
3249  *	This function assumes the current thread is runnable.  This must
3250  *	be called with the pset unlocked.
3251  */
static void
pset_cancel_deferred_dispatch(
	processor_set_t         pset,
	processor_t             processor)
{
	processor_t             active_processor = NULL;
	uint32_t                sampled_sched_run_count;

	pset_lock(pset);
	/* Relaxed load: the big comment below explains why a racy sample is OK here. */
	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * If we have emptied the run queue, and our current thread is runnable, we
	 * should tell any processors that are still DISPATCHING that they will
	 * probably not have any work to do.  In the event that there are no
	 * pending signals that we can cancel, this is also uninteresting.
	 *
	 * In the unlikely event that another thread becomes runnable while we are
	 * doing this (sched_run_count is atomically updated, not guarded), the
	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
	 * in order to dispatch it to a processor in our pset.  So, the other
	 * codepath will wait while we squash all cancelable ASTs, get the pset
	 * lock, and then dispatch the freshly runnable thread.  So this should be
	 * correct (we won't accidentally have a runnable thread that hasn't been
	 * dispatched to an idle processor), if not ideal (we may be restarting the
	 * dispatch process, which could have some overhead).
	 */

	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * CPUs that are DISPATCHING because of a deferred (cancelable) AST
		 * and do not also have an urgent (non-cancelable) AST pending.
		 */
		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);
		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
			active_processor = processor_array[cpuid];
			/*
			 * If a processor is DISPATCHING, it could be because of
			 * a cancelable signal.
			 *
			 * IF the processor is not our
			 * current processor (the current processor should not
			 * be DISPATCHING, so this is a bit paranoid), AND there
			 * is a cancelable signal pending on the processor, AND
			 * there is no non-cancelable signal pending (as there is
			 * no point trying to backtrack on bringing the processor
			 * up if a signal we cannot cancel is outstanding), THEN
			 * it should make sense to roll back the processor state
			 * to the IDLE state.
			 *
			 * If the racey nature of this approach (as the signal
			 * will be arbitrated by hardware, and can fire as we
			 * roll back state) results in the core responding
			 * despite being pushed back to the IDLE state, it
			 * should be no different than if the core took some
			 * interrupt while IDLE.
			 */
			if (active_processor != processor) {
				/*
				 * Squash all of the processor state back to some
				 * reasonable facsimile of PROCESSOR_IDLE.
				 */

				processor_state_update_idle(active_processor);
				active_processor->deadline = RT_DEADLINE_NONE;
				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
				/* Clear the deferred-AST bit before cancelling the wakeup signal. */
				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
				machine_signal_idle_cancel(active_processor);
			}
		}
	}

	pset_unlock(pset);
}
3324 #else
3325 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3326 #endif
3327 
3328 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3329 thread_csw_callout(
3330 	thread_t            old,
3331 	thread_t            new,
3332 	uint64_t            timestamp)
3333 {
3334 	perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3335 	uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3336 	machine_switch_perfcontrol_context(event, timestamp, 0,
3337 	    same_pri_latency, old, new);
3338 }
3339 
3340 
3341 /*
3342  *	thread_dispatch:
3343  *
3344  *	Handle threads at context switch.  Re-dispatch other thread
3345  *	if still running, otherwise update run state and perform
3346  *	special actions.  Update quantum for other thread and begin
3347  *	the quantum for ourselves.
3348  *
3349  *      "thread" is the old thread that we have switched away from.
3350  *      "self" is the new current thread that we have context switched to
3351  *
3352  *	Called at splsched.
3353  *
3354  */
3355 void
thread_dispatch(thread_t thread,thread_t self)3356 thread_dispatch(
3357 	thread_t                thread,
3358 	thread_t                self)
3359 {
3360 	processor_t             processor = self->last_processor;
3361 	bool was_idle = false;
3362 	bool processor_bootstrap = (thread == THREAD_NULL);
3363 
3364 	assert(processor == current_processor());
3365 	assert(self == current_thread_volatile());
3366 	assert(thread != self);
3367 
3368 	if (thread != THREAD_NULL) {
3369 		/*
3370 		 * Do the perfcontrol callout for context switch.
3371 		 * The reason we do this here is:
3372 		 * - thread_dispatch() is called from various places that are not
3373 		 *   the direct context switch path for eg. processor shutdown etc.
3374 		 *   So adding the callout here covers all those cases.
3375 		 * - We want this callout as early as possible to be close
3376 		 *   to the timestamp taken in thread_invoke()
3377 		 * - We want to avoid holding the thread lock while doing the
3378 		 *   callout
3379 		 * - We do not want to callout if "thread" is NULL.
3380 		 */
3381 		thread_csw_callout(thread, self, processor->last_dispatch);
3382 
3383 #if KASAN
3384 		if (thread->continuation != NULL) {
3385 			/*
3386 			 * Thread has a continuation and the normal stack is going away.
3387 			 * Unpoison the stack and mark all fakestack objects as unused.
3388 			 */
3389 #if KASAN_CLASSIC
3390 			kasan_fakestack_drop(thread);
3391 #endif /* KASAN_CLASSIC */
3392 			if (thread->kernel_stack) {
3393 				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3394 			}
3395 		}
3396 
3397 
3398 #if KASAN_CLASSIC
3399 		/*
3400 		 * Free all unused fakestack objects.
3401 		 */
3402 		kasan_fakestack_gc(thread);
3403 #endif /* KASAN_CLASSIC */
3404 #endif /* KASAN */
3405 
3406 		/*
3407 		 *	If blocked at a continuation, discard
3408 		 *	the stack.
3409 		 */
3410 		if (thread->continuation != NULL && thread->kernel_stack != 0) {
3411 			stack_free(thread);
3412 		}
3413 
3414 		if (thread->state & TH_IDLE) {
3415 			was_idle = true;
3416 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3417 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3418 			    (uintptr_t)thread_tid(thread), 0, thread->state,
3419 			    sched_run_buckets[TH_BUCKET_RUN], 0);
3420 		} else {
3421 			int64_t consumed;
3422 			int64_t remainder = 0;
3423 
3424 			if (processor->quantum_end > processor->last_dispatch) {
3425 				remainder = processor->quantum_end -
3426 				    processor->last_dispatch;
3427 			}
3428 
3429 			consumed = thread->quantum_remaining - remainder;
3430 
3431 			if ((thread->reason & AST_LEDGER) == 0) {
3432 				/*
3433 				 * Bill CPU time to both the task and
3434 				 * the individual thread.
3435 				 */
3436 				ledger_credit_thread(thread, thread->t_ledger,
3437 				    task_ledgers.cpu_time, consumed);
3438 				ledger_credit_thread(thread, thread->t_threadledger,
3439 				    thread_ledgers.cpu_time, consumed);
3440 				if (thread->t_bankledger) {
3441 					ledger_credit_thread(thread, thread->t_bankledger,
3442 					    bank_ledgers.cpu_time,
3443 					    (consumed - thread->t_deduct_bank_ledger_time));
3444 				}
3445 				thread->t_deduct_bank_ledger_time = 0;
3446 				if (consumed > 0) {
3447 					/*
3448 					 * This should never be negative, but in traces we are seeing some instances
3449 					 * of consumed being negative.
3450 					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3451 					 */
3452 					SCHED(update_pset_avg_execution_time)(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
3453 				}
3454 			}
3455 
3456 			/* For the thread that we just context switched away from, figure
3457 			 * out if we have expired the wq quantum and set the AST if we have
3458 			 */
3459 			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3460 				thread_evaluate_workqueue_quantum_expiry(thread);
3461 			}
3462 
3463 			if (__improbable(thread->rwlock_count != 0)) {
3464 				smr_mark_active_trackers_stalled(thread);
3465 			}
3466 
3467 			/*
3468 			 * Pairs with task_restartable_ranges_synchronize
3469 			 */
3470 			wake_lock(thread);
3471 			thread_lock(thread);
3472 
3473 			/*
3474 			 * Same as ast_check(), in case we missed the IPI
3475 			 */
3476 			thread_reset_pcs_ack_IPI(thread);
3477 
3478 			/*
3479 			 * Apply a priority floor if the thread holds a kernel resource
3480 			 * or explicitly requested it.
3481 			 * Do this before checking starting_pri to avoid overpenalizing
3482 			 * repeated rwlock blockers.
3483 			 */
3484 			if (__improbable(thread->rwlock_count != 0)) {
3485 				lck_rw_set_promotion_locked(thread);
3486 			}
3487 			if (__improbable(thread->priority_floor_count != 0)) {
3488 				thread_floor_boost_set_promotion_locked(thread);
3489 			}
3490 
3491 			boolean_t keep_quantum = processor->first_timeslice;
3492 
3493 			/*
3494 			 * Treat a thread which has dropped priority since it got on core
3495 			 * as having expired its quantum.
3496 			 */
3497 			if (processor->starting_pri > thread->sched_pri) {
3498 				keep_quantum = FALSE;
3499 			}
3500 
3501 			/* Compute remainder of current quantum. */
3502 			if (keep_quantum &&
3503 			    processor->quantum_end > processor->last_dispatch) {
3504 				thread->quantum_remaining = (uint32_t)remainder;
3505 			} else {
3506 				thread->quantum_remaining = 0;
3507 			}
3508 
3509 			if (thread->sched_mode == TH_MODE_REALTIME) {
3510 				/*
3511 				 *	Cancel the deadline if the thread has
3512 				 *	consumed the entire quantum.
3513 				 */
3514 				if (thread->quantum_remaining == 0) {
3515 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3516 					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3517 					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3518 				}
3519 			} else {
3520 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3521 				/*
3522 				 *	For non-realtime threads treat a tiny
3523 				 *	remaining quantum as an expired quantum
3524 				 *	but include what's left next time.
3525 				 */
3526 				if (thread->quantum_remaining < min_std_quantum) {
3527 					thread->reason |= AST_QUANTUM;
3528 					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3529 				}
3530 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3531 			}
3532 
3533 			/*
3534 			 *	If we are doing a direct handoff then
3535 			 *	take the remainder of the quantum.
3536 			 */
3537 			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3538 				self->quantum_remaining = thread->quantum_remaining;
3539 				thread->reason |= AST_QUANTUM;
3540 				thread->quantum_remaining = 0;
3541 			}
3542 
3543 			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3544 
3545 			if (!(thread->state & TH_WAIT)) {
3546 				/*
3547 				 *	Still runnable.
3548 				 */
3549 				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3550 
3551 				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);
3552 
3553 				ast_t reason = thread->reason;
3554 				sched_options_t options = SCHED_NONE;
3555 
3556 				if (reason & AST_REBALANCE) {
3557 					options |= SCHED_REBALANCE;
3558 					if (reason & AST_QUANTUM) {
3559 						/*
3560 						 * Having gone to the trouble of forcing this thread off a less preferred core,
3561 						 * we should force the preferable core to reschedule immediately to give this
3562 						 * thread a chance to run instead of just sitting on the run queue where
3563 						 * it may just be stolen back by the idle core we just forced it off.
3564 						 * But only do this at the end of a quantum to prevent cascading effects.
3565 						 */
3566 						options |= SCHED_STIR_POT;
3567 					}
3568 				}
3569 
3570 				if (reason & AST_QUANTUM) {
3571 					options |= SCHED_TAILQ;
3572 				} else if (reason & AST_PREEMPT) {
3573 					options |= SCHED_HEADQ;
3574 				} else {
3575 					options |= (SCHED_PREEMPT | SCHED_TAILQ);
3576 				}
3577 
3578 				thread_setrun(thread, options);
3579 
3580 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3581 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3582 				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3583 				    sched_run_buckets[TH_BUCKET_RUN], 0);
3584 
3585 				if (thread->wake_active) {
3586 					thread->wake_active = FALSE;
3587 					thread_unlock(thread);
3588 
3589 					thread_wakeup(&thread->wake_active);
3590 				} else {
3591 					thread_unlock(thread);
3592 				}
3593 
3594 				wake_unlock(thread);
3595 			} else {
3596 				/*
3597 				 *	Waiting.
3598 				 */
3599 				boolean_t should_terminate = FALSE;
3600 				uint32_t new_run_count;
3601 				int thread_state = thread->state;
3602 
3603 				/* Only the first call to thread_dispatch
3604 				 * after explicit termination should add
3605 				 * the thread to the termination queue
3606 				 */
3607 				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3608 					should_terminate = TRUE;
3609 					thread_state |= TH_TERMINATE2;
3610 				}
3611 
3612 				timer_stop(&thread->runnable_timer, processor->last_dispatch);
3613 
3614 				thread_state &= ~TH_RUN;
3615 				thread->state = thread_state;
3616 
3617 				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3618 				thread->chosen_processor = PROCESSOR_NULL;
3619 
3620 				new_run_count = SCHED(run_count_decr)(thread);
3621 
3622 #if CONFIG_SCHED_AUTO_JOIN
3623 				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3624 					work_interval_auto_join_unwind(thread);
3625 				}
3626 #endif /* CONFIG_SCHED_AUTO_JOIN */
3627 
3628 #if CONFIG_SCHED_SFI
3629 				if (thread->reason & AST_SFI) {
3630 					thread->wait_sfi_begin_time = processor->last_dispatch;
3631 				}
3632 #endif
3633 				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);
3634 
3635 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3636 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3637 				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3638 				    new_run_count, 0);
3639 
3640 				if (thread_state & TH_WAIT_REPORT) {
3641 					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3642 				}
3643 
3644 				if (thread->wake_active) {
3645 					thread->wake_active = FALSE;
3646 					thread_unlock(thread);
3647 
3648 					thread_wakeup(&thread->wake_active);
3649 				} else {
3650 					thread_unlock(thread);
3651 				}
3652 
3653 				wake_unlock(thread);
3654 
3655 				if (should_terminate) {
3656 					thread_terminate_enqueue(thread);
3657 				}
3658 			}
3659 		}
3660 		/*
3661 		 * The thread could have been added to the termination queue, so it's
3662 		 * unsafe to use after this point.
3663 		 */
3664 		thread = THREAD_NULL;
3665 	}
3666 
3667 	int urgency = THREAD_URGENCY_NONE;
3668 	uint64_t latency = 0;
3669 
3670 	/* Update (new) current thread and reprogram running timers */
3671 	thread_lock(self);
3672 
3673 	if (!(self->state & TH_IDLE)) {
3674 		uint64_t        arg1, arg2;
3675 
3676 #if CONFIG_SCHED_SFI
3677 		ast_t                   new_ast;
3678 
3679 		new_ast = sfi_thread_needs_ast(self, NULL);
3680 
3681 		if (new_ast != AST_NONE) {
3682 			ast_on(new_ast);
3683 		}
3684 #endif
3685 
3686 		if (processor->last_dispatch < self->last_made_runnable_time) {
3687 			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3688 			    processor->last_dispatch, self->last_made_runnable_time);
3689 		}
3690 
3691 		assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3692 
3693 		latency = processor->last_dispatch - self->last_made_runnable_time;
3694 		assert(latency >= self->same_pri_latency);
3695 
3696 		urgency = thread_get_urgency(self, &arg1, &arg2);
3697 
3698 		thread_tell_urgency(urgency, arg1, arg2, latency, self);
3699 
3700 		/*
3701 		 *	Start a new CPU limit interval if the previous one has
3702 		 *	expired. This should happen before initializing a new
3703 		 *	quantum.
3704 		 */
3705 		if (cpulimit_affects_quantum &&
3706 		    thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
3707 			thread_cpulimit_restart(processor->last_dispatch);
3708 		}
3709 
3710 		/*
3711 		 *	Get a new quantum if none remaining.
3712 		 */
3713 		if (self->quantum_remaining == 0) {
3714 			thread_quantum_init(self, processor->last_dispatch);
3715 		}
3716 
3717 		/*
3718 		 *	Set up quantum timer and timeslice.
3719 		 */
3720 		processor->quantum_end = processor->last_dispatch +
3721 		    self->quantum_remaining;
3722 
3723 		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
3724 		    processor->quantum_end, processor->last_dispatch);
3725 		if (was_idle) {
3726 			/*
3727 			 * kperf's running timer is active whenever the idle thread for a
3728 			 * CPU is not running.
3729 			 */
3730 			kperf_running_setup(processor, processor->last_dispatch);
3731 		}
3732 		running_timers_activate(processor);
3733 		processor->first_timeslice = TRUE;
3734 	} else {
3735 		if (!processor_bootstrap) {
3736 			running_timers_deactivate(processor);
3737 		}
3738 		processor->first_timeslice = FALSE;
3739 		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
3740 	}
3741 
3742 	assert(self->block_hint == kThreadWaitNone);
3743 	self->computation_epoch = processor->last_dispatch;
3744 	/*
3745 	 * This relies on the interrupt time being tallied up to the thread in the
3746 	 * exception handler epilogue, which is before AST context where preemption
3747 	 * is considered (and the scheduler is potentially invoked to
3748 	 * context switch, here).
3749 	 */
3750 	self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
3751 	self->reason = AST_NONE;
3752 	processor->starting_pri = self->sched_pri;
3753 
3754 	thread_unlock(self);
3755 
3756 	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
3757 	    processor->last_dispatch);
3758 
3759 #if defined(CONFIG_SCHED_DEFERRED_AST)
3760 	/*
3761 	 * TODO: Can we state that redispatching our old thread is also
3762 	 * uninteresting?
3763 	 */
3764 	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3765 		pset_cancel_deferred_dispatch(processor->processor_set, processor);
3766 	}
3767 #endif
3768 }
3769 
3770 /*
3771  *	thread_block_reason:
3772  *
3773  *	Forces a reschedule, blocking the caller if a wait
3774  *	has been asserted.
3775  *
3776  *	If a continuation is specified, then thread_invoke will
3777  *	attempt to discard the thread's kernel stack.  When the
3778  *	thread resumes, it will execute the continuation function
3779  *	on a new kernel stack.
3780  */
__mockable wait_result_t
thread_block_reason(
	thread_continue_t       continuation,
	void                            *parameter,
	ast_t                           reason)
{
	thread_t        self = current_thread();
	processor_t     processor;
	thread_t        new_thread;
	spl_t           s;

	/* Raise to splsched; interrupts stay disabled across the switch. */
	s = splsched();

	processor = current_processor();

	/* If we're explicitly yielding, force a subsequent quantum */
	if (reason & AST_YIELD) {
		processor->first_timeslice = FALSE;
	}

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	clear_pending_nonurgent_preemption(processor);

#if PROC_REF_DEBUG
	/*
	 * A continuation discards the kernel stack, so a user thread must not
	 * block with proc refs held that only the stack remembers.
	 */
	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
	}
#endif

#if CONFIG_EXCLAVES
	/* NOTE(review): presumably exclave state pins the stack, so no
	 * continuation may be pending while in any exclave state — confirm. */
	if (continuation != NULL) {
		assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0);
	}
#endif /* CONFIG_EXCLAVES */

	self->continuation = continuation;
	self->parameter = parameter;

	/* Trace blocks that are for a reason other than plain preemption. */
	if (self->state & ~(TH_RUN | TH_IDLE)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	}

	/*
	 * Pick a successor and attempt the context switch.  thread_invoke()
	 * may fail, in which case we select again and retry.
	 */
	do {
		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	} while (!thread_invoke(self, new_thread, reason));

	splx(s);

	/* If we waited, this is the outcome of the wait. */
	return self->wait_result;
}
3837 
3838 /*
3839  *	thread_block:
3840  *
3841  *	Block the current thread if a wait has been asserted.
3842  */
wait_result_t
thread_block(
	thread_continue_t       continuation)
{
	/* Shorthand for thread_block_reason() with no parameter and no reason. */
	return thread_block_reason(continuation, NULL, AST_NONE);
}
3849 
/*
 *	thread_block_parameter:
 *
 *	As thread_block(), but additionally passes a parameter to the
 *	continuation when the thread resumes.
 */
wait_result_t
thread_block_parameter(
	thread_continue_t       continuation,
	void                            *parameter)
{
	return thread_block_reason(continuation, parameter, AST_NONE);
}
3857 
3858 /*
3859  *	thread_run:
3860  *
3861  *	Switch directly from the current thread to the
3862  *	new thread, handing off our quantum if appropriate.
3863  *
3864  *	New thread must be runnable, and not on a run queue.
3865  *
3866  *	Called at splsched.
3867  */
int
thread_run(
	thread_t                        self,
	thread_continue_t       continuation,
	void                            *parameter,
	thread_t                        new_thread)
{
	ast_t reason = AST_NONE;

	/* An idle thread has no quantum worth handing off. */
	if ((self->state & TH_IDLE) == 0) {
		reason = AST_HANDOFF;
	}

	/* Must not get here without a chosen processor */
	assert(new_thread->chosen_processor);

	self->continuation = continuation;
	self->parameter = parameter;

	while (!thread_invoke(self, new_thread, reason)) {
		/* the handoff failed, so we have to fall back to the normal block path */
		processor_t processor = current_processor();

		/* No handoff on retry; let thread_select() pick freely. */
		reason = AST_NONE;

		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	}

	return self->wait_result;
}
3900 
3901 /*
3902  *	thread_continue:
3903  *
3904  *	Called at splsched when a thread first receives
3905  *	a new stack after a continuation.
3906  *
3907  *	Called with THREAD_NULL as the old thread when
3908  *	invoked by machine_load_context.
3909  */
void
thread_continue(
	thread_t        thread)
{
	thread_t                self = current_thread();
	thread_continue_t       continuation;
	void                    *parameter;

	DTRACE_SCHED(on__cpu);

	/* Snapshot before thread_dispatch(); the fields are cleared below. */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif


	/* Finish dispatching the thread we switched away from. */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if KASAN_TBI
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN_TBI */


	/* Runs the continuation on the new stack; does not return. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
3956 
3957 void
thread_quantum_init(thread_t thread,uint64_t now)3958 thread_quantum_init(thread_t thread, uint64_t now)
3959 {
3960 	uint64_t new_quantum = 0;
3961 
3962 	switch (thread->sched_mode) {
3963 	case TH_MODE_REALTIME:
3964 		new_quantum = thread->realtime.computation;
3965 		new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
3966 		break;
3967 
3968 	case TH_MODE_FIXED:
3969 		new_quantum = SCHED(initial_quantum_size)(thread);
3970 		new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
3971 		break;
3972 
3973 	default:
3974 		new_quantum = SCHED(initial_quantum_size)(thread);
3975 		break;
3976 	}
3977 
3978 	if (cpulimit_affects_quantum) {
3979 		const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
3980 
3981 		/*
3982 		 * If there's no remaining CPU time, the ledger system will
3983 		 * notice and put the thread to sleep.
3984 		 */
3985 		if (cpulimit_remaining > 0) {
3986 			new_quantum = MIN(new_quantum, cpulimit_remaining);
3987 		}
3988 	}
3989 
3990 	assert3u(new_quantum, <, UINT32_MAX);
3991 	assert3u(new_quantum, >, 0);
3992 
3993 	thread->quantum_remaining = (uint32_t)new_quantum;
3994 }
3995 
3996 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)3997 sched_timeshare_initial_quantum_size(thread_t thread)
3998 {
3999 	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4000 		return bg_quantum;
4001 	} else {
4002 		return std_quantum;
4003 	}
4004 }
4005 
4006 /*
4007  *	run_queue_init:
4008  *
4009  *	Initialize a run queue before first use.
4010  */
4011 void
run_queue_init(run_queue_t rq)4012 run_queue_init(
4013 	run_queue_t             rq)
4014 {
4015 	rq->highq = NOPRI;
4016 	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4017 		rq->bitmap[i] = 0;
4018 	}
4019 	rq->urgency = rq->count = 0;
4020 	for (int i = 0; i < NRQS; i++) {
4021 		circle_queue_init(&rq->queues[i]);
4022 	}
4023 }
4024 
4025 /*
4026  *	run_queue_dequeue:
4027  *
4028  *	Perform a dequeue operation on a run queue,
4029  *	and return the resulting thread.
4030  *
4031  *	The run queue must be locked (see thread_run_queue_remove()
4032  *	for more info), and not empty.
4033  */
4034 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)4035 run_queue_dequeue(
4036 	run_queue_t     rq,
4037 	sched_options_t options)
4038 {
4039 	thread_t        thread;
4040 	circle_queue_t  queue = &rq->queues[rq->highq];
4041 
4042 	if (options & SCHED_HEADQ) {
4043 		thread = cqe_dequeue_head(queue, struct thread, runq_links);
4044 	} else {
4045 		thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4046 	}
4047 
4048 	assert(thread != THREAD_NULL);
4049 	assert_thread_magic(thread);
4050 
4051 	thread_clear_runq(thread);
4052 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4053 	rq->count--;
4054 	if (SCHED(priority_is_urgent)(rq->highq)) {
4055 		rq->urgency--; assert(rq->urgency >= 0);
4056 	}
4057 	if (circle_queue_empty(queue)) {
4058 		bitmap_clear(rq->bitmap, rq->highq);
4059 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4060 	}
4061 
4062 	return thread;
4063 }
4064 
4065 /*
4066  *	run_queue_enqueue:
4067  *
4068  *	Perform a enqueue operation on a run queue.
4069  *
4070  *	The run queue must be locked (see thread_run_queue_remove()
4071  *	for more info).
4072  */
4073 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4074 run_queue_enqueue(
4075 	run_queue_t      rq,
4076 	thread_t         thread,
4077 	sched_options_t  options)
4078 {
4079 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4080 	boolean_t       result = FALSE;
4081 
4082 	assert_thread_magic(thread);
4083 
4084 	if (circle_queue_empty(queue)) {
4085 		circle_enqueue_tail(queue, &thread->runq_links);
4086 
4087 		rq_bitmap_set(rq->bitmap, thread->sched_pri);
4088 		if (thread->sched_pri > rq->highq) {
4089 			rq->highq = thread->sched_pri;
4090 			result = TRUE;
4091 		}
4092 	} else {
4093 		if (options & SCHED_TAILQ) {
4094 			circle_enqueue_tail(queue, &thread->runq_links);
4095 		} else {
4096 			circle_enqueue_head(queue, &thread->runq_links);
4097 		}
4098 	}
4099 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4100 		rq->urgency++;
4101 	}
4102 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4103 	rq->count++;
4104 
4105 	return result;
4106 }
4107 
4108 /*
4109  *	run_queue_remove:
4110  *
4111  *	Remove a specific thread from a runqueue.
4112  *
4113  *	The run queue must be locked.
4114  */
4115 void
run_queue_remove(run_queue_t rq,thread_t thread)4116 run_queue_remove(
4117 	run_queue_t    rq,
4118 	thread_t       thread)
4119 {
4120 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4121 
4122 	thread_assert_runq_nonnull(thread);
4123 	assert_thread_magic(thread);
4124 
4125 	circle_dequeue(queue, &thread->runq_links);
4126 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4127 	rq->count--;
4128 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4129 		rq->urgency--; assert(rq->urgency >= 0);
4130 	}
4131 
4132 	if (circle_queue_empty(queue)) {
4133 		/* update run queue status */
4134 		bitmap_clear(rq->bitmap, thread->sched_pri);
4135 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4136 	}
4137 
4138 	thread_clear_runq(thread);
4139 }
4140 
4141 /*
4142  *      run_queue_peek
4143  *
4144  *      Peek at the runq and return the highest
4145  *      priority thread from the runq.
4146  *
4147  *	The run queue must be locked.
4148  */
4149 thread_t
run_queue_peek(run_queue_t rq)4150 run_queue_peek(
4151 	run_queue_t    rq)
4152 {
4153 	if (rq->count > 0) {
4154 		circle_queue_t queue = &rq->queues[rq->highq];
4155 		thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4156 		assert_thread_magic(thread);
4157 		return thread;
4158 	} else {
4159 		return THREAD_NULL;
4160 	}
4161 }
4162 
4163 /*
4164  *	realtime_setrun:
4165  *
4166  *	Dispatch a thread for realtime execution.
4167  *
4168  *	Thread must be locked.  Associated pset must
4169  *	be locked, and is returned unlocked.
4170  */
static void
realtime_setrun(
	processor_t                     chosen_processor,
	thread_t                        thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	bool pset_is_locked = true;

	int n_backup = 0;

	/* Tightly-constrained RT threads also get backup processors signalled. */
	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/*
	 * CPUs already carrying an urgent AST beyond the number of queued RT
	 * threads act as existing backups; reduce how many more we signal.
	 */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	/* Slot 0 is the chosen processor; slots 1..n_backup are follow-up IPIs. */
	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	rt_runq_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			/* Decide how (or whether) to preempt the chosen processor. */
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				/* Equal priority: preempt only for a meaningfully earlier deadline. */
				if (rt_deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* Local CPU: set the AST bits directly, no IPI needed. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking: just record the pending urgent AST. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					/* Running (or other) state. */
					if (processor == current_processor()) {
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Follow-up/backup IPI selection; requires the pset lock. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			rt_choose_next_processor_for_followup_IPI(pset, chosen_processor, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/*
			 * NOTE(review): 'backup' is not declared anywhere in this
			 * function — presumably a file-scope variable or macro;
			 * confirm it traces the intended value here.
			 */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if CONFIG_SCHED_SMT
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif /* CONFIG_SCHED_SMT */
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Deliver the collected IPIs only after the pset lock is dropped. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4310 
4311 
/*
 *	sched_ipi_deferred_policy:
 *
 *	Decide whether a deferred IPI may be used for the destination CPU.
 *	Returns SCHED_IPI_DEFERRED when deferral is allowed, SCHED_IPI_IMMEDIATE
 *	for thread groups that opt out, and SCHED_IPI_NONE when a deferred AST
 *	is already pending for the CPU.
 */
sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
    thread_t thread, __unused sched_ipi_event_t event)
{
#if defined(CONFIG_SCHED_DEFERRED_AST)
#if CONFIG_THREAD_GROUPS
	/* Some thread groups always demand immediate IPIs. */
	if (thread) {
		struct thread_group *tg = thread_group_get(thread);
		if (thread_group_uses_immediate_ipi(tg)) {
			return SCHED_IPI_IMMEDIATE;
		}
	}
#endif /* CONFIG_THREAD_GROUPS */
	/* Defer only if no deferred AST is already pending for this CPU. */
	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
		return SCHED_IPI_DEFERRED;
	}
#else /* CONFIG_SCHED_DEFERRED_AST */
	(void) thread;
	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
#endif /* CONFIG_SCHED_DEFERRED_AST */
	return SCHED_IPI_NONE;
}
4334 
4335 /* Requires the destination pset lock to be held */
sched_ipi_type_t
sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
{
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	assert(dst != NULL);

	processor_set_t pset = dst->processor_set;
	/* The local CPU handles its own ASTs; no IPI required. */
	if (current_processor() == dst) {
		return SCHED_IPI_NONE;
	}

	/* Transition an idle destination to DISPATCHING before signalling it. */
	bool dst_idle = (dst->state == PROCESSOR_IDLE);
	if (dst_idle) {
		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
	}

	/* Ask the active scheduler policy which kind of IPI to send. */
	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	switch (ipi_type) {
	case SCHED_IPI_NONE:
		return SCHED_IPI_NONE;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	case SCHED_IPI_DEFERRED:
		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
		break;
#endif /* CONFIG_SCHED_DEFERRED_AST */
	default:
		/* Immediate/idle IPIs mark both urgent and preempt ASTs pending. */
		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
		}
		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
		break;
	}
	return ipi_type;
}
4371 
4372 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4373 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4374 {
4375 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4376 	boolean_t deferred_ipi_supported = false;
4377 	processor_set_t pset = dst->processor_set;
4378 
4379 #if defined(CONFIG_SCHED_DEFERRED_AST)
4380 	deferred_ipi_supported = true;
4381 #endif /* CONFIG_SCHED_DEFERRED_AST */
4382 
4383 	switch (event) {
4384 	case SCHED_IPI_EVENT_SPILL:
4385 	case SCHED_IPI_EVENT_SMT_REBAL:
4386 	case SCHED_IPI_EVENT_REBALANCE:
4387 	case SCHED_IPI_EVENT_BOUND_THR:
4388 	case SCHED_IPI_EVENT_RT_PREEMPT:
4389 		/*
4390 		 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4391 		 * scenarios use immediate IPIs always.
4392 		 */
4393 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4394 		break;
4395 	case SCHED_IPI_EVENT_PREEMPT:
4396 		/* In the preemption case, use immediate IPIs for RT threads */
4397 		if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4398 			ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4399 			break;
4400 		}
4401 
4402 		/*
4403 		 * For Non-RT threads preemption,
4404 		 * If the core is active, use immediate IPIs.
4405 		 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4406 		 */
4407 		if (deferred_ipi_supported && dst_idle) {
4408 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4409 		}
4410 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4411 		break;
4412 	default:
4413 		panic("Unrecognized scheduler IPI event type %d", event);
4414 	}
4415 	assert(ipi_type != SCHED_IPI_NONE);
4416 	return ipi_type;
4417 }
4418 
4419 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4420 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4421 {
4422 	switch (ipi) {
4423 	case SCHED_IPI_NONE:
4424 		break;
4425 	case SCHED_IPI_IDLE:
4426 		machine_signal_idle(dst);
4427 		break;
4428 	case SCHED_IPI_IMMEDIATE:
4429 		cause_ast_check(dst);
4430 		break;
4431 	case SCHED_IPI_DEFERRED:
4432 		machine_signal_idle_deferred(dst);
4433 		break;
4434 	default:
4435 		panic("Unrecognized scheduler IPI type: %d", ipi);
4436 	}
4437 }
4438 
4439 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4440 
4441 boolean_t
priority_is_urgent(int priority)4442 priority_is_urgent(int priority)
4443 {
4444 	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4445 }
4446 
4447 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4448 
4449 /*
4450  *	processor_setrun:
4451  *
4452  *	Dispatch a thread for execution on a
4453  *	processor.
4454  *
4455  *	Thread must be locked.  Associated pset must
4456  *	be locked, and is returned unlocked.
4457  */
4458 static void
processor_setrun(processor_t processor,thread_t thread,sched_options_t options)4459 processor_setrun(
4460 	processor_t                     processor,
4461 	thread_t                        thread,
4462 	sched_options_t                 options)
4463 {
4464 	processor_set_t pset = processor->processor_set;
4465 	pset_assert_locked(pset);
4466 	ast_t preempt = AST_NONE;
4467 	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4468 
4469 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4470 
4471 	thread->chosen_processor = processor;
4472 
4473 	/*
4474 	 *	Set preemption mode.
4475 	 */
4476 #if defined(CONFIG_SCHED_DEFERRED_AST)
4477 	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4478 #endif
4479 	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4480 		preempt = (AST_PREEMPT | AST_URGENT);
4481 	} else if (processor->current_is_eagerpreempt) {
4482 		preempt = (AST_PREEMPT | AST_URGENT);
4483 	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4484 		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4485 			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4486 		} else {
4487 			preempt = AST_NONE;
4488 		}
4489 	} else {
4490 		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4491 	}
4492 
4493 	if ((options & SCHED_STIR_POT) ||
4494 	    ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE))) {
4495 		/*
4496 		 * Having gone to the trouble of forcing this thread off a less preferred core,
4497 		 * we should force the preferable core to reschedule immediately to give this
4498 		 * thread a chance to run instead of just sitting on the run queue where
4499 		 * it may just be stolen back by the idle core we just forced it off.
4500 		 */
4501 		preempt |= AST_PREEMPT;
4502 	}
4503 
4504 	SCHED(processor_enqueue)(processor, thread, options);
4505 	SCHED(update_pset_load_average)(pset, 0);
4506 
4507 	if (preempt != AST_NONE) {
4508 		if (processor->state == PROCESSOR_IDLE) {
4509 			ipi_action = eExitIdle;
4510 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4511 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4512 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4513 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4514 			}
4515 		} else if (processor->state == PROCESSOR_RUNNING &&
4516 		    (thread->sched_pri >= processor->current_pri)) {
4517 			ipi_action = eInterruptRunning;
4518 		}
4519 	} else {
4520 		/*
4521 		 * New thread is not important enough to preempt what is running, but
4522 		 * special processor states may need special handling
4523 		 */
4524 		if (processor->state == PROCESSOR_IDLE) {
4525 			ipi_action = eExitIdle;
4526 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4527 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4528 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4529 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
4530 			}
4531 		}
4532 	}
4533 
4534 	if (ipi_action != eDoNothing) {
4535 		if (processor == current_processor()) {
4536 			if (ipi_action == eExitIdle) {
4537 				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4538 			}
4539 			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
4540 				ast_on(preempt);
4541 			}
4542 
4543 			if ((preempt & AST_URGENT) == AST_URGENT) {
4544 				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4545 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4546 					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
4547 				}
4548 			} else {
4549 				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4550 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
4551 				}
4552 			}
4553 
4554 			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4555 				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4556 			} else {
4557 				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4558 			}
4559 		} else {
4560 			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
4561 			ipi_type = sched_ipi_action(processor, thread, event);
4562 		}
4563 	}
4564 
4565 	pset_unlock(pset);
4566 	sched_ipi_perform(processor, ipi_type);
4567 
4568 	if (ipi_action != eDoNothing && processor == current_processor()) {
4569 		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
4570 		ast_on(new_preempt);
4571 	}
4572 }
4573 
4574 /*
4575  *	choose_next_pset:
4576  *
4577  *	Return the next sibling pset containing
4578  *	available processors.
4579  *
4580  *	Returns the original pset if none other is
4581  *	suitable.
4582  */
4583 static processor_set_t
choose_next_pset(processor_set_t pset)4584 choose_next_pset(
4585 	processor_set_t         pset)
4586 {
4587 	processor_set_t         nset = pset;
4588 
4589 	do {
4590 		nset = next_pset(nset);
4591 
4592 		/*
4593 		 * Sometimes during startup the pset_map can contain a bit
4594 		 * for a pset that isn't fully published in pset_array because
4595 		 * the pset_map read isn't an acquire load.
4596 		 *
4597 		 * In order to avoid needing an acquire barrier here, just bail
4598 		 * out.
4599 		 */
4600 		if (nset == PROCESSOR_SET_NULL) {
4601 			return pset;
4602 		}
4603 	} while (nset->online_processor_count < 1 && nset != pset);
4604 
4605 	return nset;
4606 }
4607 
4608 #if CONFIG_SCHED_SMT
/*
 *	choose_processor_smt:
 *
 *	SMT-aware implementation of choose_processor.
 *
 *	Chooses a processor for the thread, beginning at starting_pset and
 *	optionally seeded with a last-processor hint.  Preference order for
 *	non-RT threads: a validated hint, an idle preferred/non-avoided/any
 *	primary, a preemptible lower-priority primary, then (after exhausting
 *	all psets) an idle or low-priority secondary, or the least-loaded CPU.
 *	RT threads are handed off to SCHED(rt_choose_processor)().
 *
 *	The thread must be locked.  starting_pset must be locked on entry;
 *	on a non-NULL return the chosen processor's pset (which may differ
 *	from starting_pset) is locked.  Returns PROCESSOR_NULL with no pset
 *	locked if no usable processor exists.
 */
processor_t
choose_processor_smt(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread,
	__unused sched_options_t *options)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * Prefer the hinted processor, when appropriate.
	 */

	/* Fold last processor hint from secondary processor to its primary */
	if (processor != PROCESSOR_NULL) {
		processor = processor->processor_primary;
	}

	/*
	 * Only consult platform layer if pset is active, which
	 * it may not be in some cases when a multi-set system
	 * is going to sleep.
	 */
	if (pset->online_processor_count) {
		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
			processor_t mc_processor = machine_choose_processor(pset, processor);
			if (mc_processor != PROCESSOR_NULL) {
				processor = mc_processor->processor_primary;
			}
		}
	}

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
					uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
					/*
					 * If the rotation bitmask to force a migration is set for this core
					 * and there's an idle core that needn't be avoided, don't continue
					 * running on the same core.
					 */
					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
						return processor;
					}
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 *
	 * A primary/secondary pair of SMT processors are
	 * "unpaired" if the primary is busy but its
	 * corresponding secondary is idle (so the physical
	 * core has full use of its resources).
	 */

	assert(pset == starting_pset);
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		return SCHED(rt_choose_processor)(pset, processor, thread);
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_secondary_priority = MAXPRI + 1;
	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */
		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
		uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		/*
		 * Look at the preferred cores first, rotating from the last
		 * preferred core chosen so successive wakeups spread out.
		 */
		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_primary_map);
		}
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			pset->cpu_preferred_last_chosen = cpuid;
			return processor;
		}

		/*
		 * Look at the cores that don't need to be avoided next.
		 */
		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
			uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_primary_map);
			}
			if (cpuid >= 0) {
				processor = processor_array[cpuid];
				pset->cpu_preferred_last_chosen = cpuid;
				return processor;
			}
		}

		/*
		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
		 */
		cpuid = lsb_first(idle_primary_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate so the scan starts just after the last chosen CPU. */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			processor_t primary = processor->processor_primary;
			if (primary != processor) {
				/* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
					if (cpri < lowest_secondary_priority) {
						lowest_secondary_priority = cpri;
						lp_paired_secondary_processor = processor;
					}
				}
			} else {
				if (cpri < lowest_priority) {
					lowest_priority = cpri;
					lp_processor = processor;
				}
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
		 * the idle primary would have short-circuited the loop above
		 */
		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    ~pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);

		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
			processor = processor_array[cpuid];

			processor_t cprimary = processor->processor_primary;

			integer_t primary_pri = cprimary->current_pri;

			/*
			 * TODO: This should also make the same decisions
			 * as secondary_can_run_realtime_thread
			 *
			 * TODO: Keep track of the pending preemption priority
			 * of the primary to make this more accurate.
			 */

			/* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
				continue;
			}

			/*
			 * Find the idle secondary processor with the lowest priority primary
			 *
			 * We will choose this processor as a fallback if we find no better
			 * primary to preempt.
			 */
			if (primary_pri < lowest_idle_secondary_priority) {
				lp_idle_secondary_processor = processor;
				lowest_idle_secondary_priority = primary_pri;
			}

			/* Find the lowest priority active primary with idle secondary */
			if (primary_pri < lowest_unpaired_primary_priority) {
				/* If the primary processor is offline or starting up, it's not a candidate for this path */
				if (cprimary->state != PROCESSOR_RUNNING &&
				    cprimary->state != PROCESSOR_DISPATCHING) {
					continue;
				}

				if (!cprimary->is_recommended) {
					continue;
				}

				/* if the primary is pending preemption, don't try to re-preempt it */
				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				lowest_unpaired_primary_priority = primary_pri;
				lp_unpaired_primary_processor = cprimary;
			}
		}

		/*
		 * We prefer preempting a primary processor over waking up its secondary.
		 * The secondary will then be woken up by the preempted thread.
		 */
		if (thread->sched_pri > lowest_unpaired_primary_priority) {
			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
			return lp_unpaired_primary_processor;
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			/* Drops the old pset lock and takes the new one. */
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/*
	 * Make sure that we pick a running processor,
	 * and that the correct processor set is locked.
	 * Since we may have unlocked the candidate processor's
	 * pset, it may have changed state.
	 *
	 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the least priority
	 * primary, or the least busy primary.
	 */

	/* lowest_priority is evaluated in the main loops above */
	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
		processor = lp_idle_secondary_processor;
	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
		processor = lp_paired_secondary_processor;
	} else if (lc_processor != PROCESSOR_NULL) {
		processor = lc_processor;
	} else {
		processor = PROCESSOR_NULL;
	}

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5015 #else /* !CONFIG_SCHED_SMT */
/*
 *	choose_processor:
 *
 *	Choose a processor for the thread, beginning at
 *	the pset.  Accepts an optional processor hint in
 *	the pset.
 *
 *	Non-SMT variant: preference order for non-RT threads is a
 *	validated hint, an idle preferred/non-avoided/any recommended
 *	CPU, a preemptible lower-priority CPU, then (after walking all
 *	psets) the least-loaded CPU.  RT threads are handed off to
 *	SCHED(rt_choose_processor)().
 *
 *	Returns a processor, possibly from a different pset.
 *
 *	The thread must be locked.  The pset must be locked,
 *	and the resulting pset is locked on return.  Returns
 *	PROCESSOR_NULL with no pset locked when nothing is usable.
 */
processor_t
choose_processor(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread,
	__unused sched_options_t *options)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert3u(thread->sched_pri, <=, MAXPRI);

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					uint64_t idle_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->recommended_bitmask);
					uint64_t non_avoided_idle_map = idle_map & ~pset->perfcontrol_cpu_migration_bitmask;
					/*
					 * If the rotation bitmask to force a migration is set for this core
					 * and there's an idle core that needn't be avoided, don't continue
					 * running on the same core.
					 */
					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_map != 0)) {
						return processor;
					}
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 */

	assert(pset == starting_pset);
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		return SCHED(rt_choose_processor)(pset, processor, thread);
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}


	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */
		uint64_t idle_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->recommended_bitmask);
		uint64_t preferred_idle_map = idle_map & pset->perfcontrol_cpu_preferred_bitmask;

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		/*
		 * Look at the preferred cores first, rotating from the last
		 * preferred core chosen so successive wakeups spread out.
		 */
		int cpuid = lsb_next(preferred_idle_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_map);
		}
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			pset->cpu_preferred_last_chosen = cpuid;
			return processor;
		}

		/*
		 * Look at the cores that don't need to be avoided next.
		 */
		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
			uint64_t non_avoided_idle_map = idle_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_map);
			}
			if (cpuid >= 0) {
				processor = processor_array[cpuid];
				pset->cpu_preferred_last_chosen = cpuid;
				return processor;
			}
		}

		/*
		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
		 */
		cpuid = lsb_first(idle_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate so the scan starts just after the last chosen CPU. */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			if (cpri < lowest_priority) {
				lowest_priority = cpri;
				lp_processor = processor;
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			/* Drops the old pset lock and takes the new one. */
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/* Everything is busier than this thread; fall back to the least-loaded CPU. */
	processor = lc_processor;

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5284 #endif /* !CONFIG_SCHED_SMT */
5285 
5286 
5287 
/*
 * Default implementation of SCHED(choose_node)()
 * for single node systems: every thread is placed
 * on the one and only node, pset_node0.
 */
pset_node_t
sched_choose_node(__unused thread_t thread)
{
	return &pset_node0;
}
5297 
/*
 *	choose_starting_pset:
 *
 *	Choose a starting processor set for the thread.
 *	May return a processor hint within the pset.
 *
 *	Affinity-set, then last-processor affinity, then the task's
 *	pset hint are consulted, after which the choice is steered by
 *	idleness (non-RT) or the node's non-RT pset maps (RT threads).
 *
 *	Returns a starting processor set, to be used by
 *      choose_processor.
 *
 *	The thread must be locked.  The resulting pset is unlocked on return,
 *      and is chosen without taking any pset locks.
 */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 *	Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 *	No Affinity case:
		 *
		 *	Utilize a per task hint to spread threads
		 *	among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		pset = pset_for_id((pset_id_t)lsb_first(node->pset_map));
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset 0.  I expect this to be true on __x86_64__ */
		assert(bit_test(pset_for_id(0)->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		pset_map_t rt_target_map;
#if CONFIG_SCHED_SMT
		/* First try psets whose primaries aren't already running RT threads. */
		rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				/* Rotate so pset 0 is considered last. */
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
#endif /* CONFIG_SCHED_SMT*/
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-RT: prefer a pset that currently has an idle CPU, if any. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* Only pass the hint back if it still lives in the chosen pset. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
5419 
5420 /*
5421  *	thread_setrun:
5422  *
5423  *	Dispatch thread for execution, onto an idle
5424  *	processor or run queue, and signal a preemption
5425  *	as appropriate.
5426  *
5427  *	Thread must be locked.
5428  */
void
thread_setrun(
	thread_t                        thread,
	sched_options_t                 options)
{
	processor_t                     processor = PROCESSOR_NULL;
	processor_set_t         pset;

	/* Thread must be runnable and must not already be on any run queue. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	thread_assert_runq_null(thread);

	/* We may take this lock below in the fallback path, so it must not be held yet. */
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_NOTOWNED);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 *	Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary.  A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			/* On success, choose_processor() leaves the chosen processor's pset locked. */
			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread, &options);
			if (processor != PROCESSOR_NULL) {
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely, we need a fallback.
		 */
		if (processor == PROCESSOR_NULL) {
			bool unlock_available_cores_lock = false;
			if (sched_all_cpus_offline()) {
				/*
				 * There are no available processors
				 * because we're in final system shutdown.
				 * Enqueue on the master processor and we'll
				 * handle it when it powers back up.
				 */
				processor = master_processor;
			} else if (support_bootcpu_shutdown) {
				/*
				 * Grab the sched_available_cores_lock to select
				 * some available processor and prevent it from
				 * becoming offline while we enqueue the thread.
				 */
				simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
				unlock_available_cores_lock = true;

				int last_resort_cpu = sched_last_resort_cpu();

				processor = processor_array[last_resort_cpu];
			} else {
				/*
				 * The master processor is never shut down, always safe to choose.
				 */
				processor = master_processor;
			}
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
			if (unlock_available_cores_lock) {
				simple_unlock(&sched_available_cores_lock);
			}
		}
		/* Remember where the task last ran so future placements cluster together. */
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 *	Bound case:
		 *
		 *	Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 *	Dispatch the thread on the chosen processor.
	 *	TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		SCHED(check_spill)(pset, thread);
	}
}
5559 
5560 processor_set_t
task_choose_pset(task_t task)5561 task_choose_pset(
5562 	task_t          task)
5563 {
5564 	processor_set_t         pset = task->pset_hint;
5565 
5566 	if (pset != PROCESSOR_SET_NULL) {
5567 		pset = choose_next_pset(pset);
5568 	}
5569 
5570 	return pset;
5571 }
5572 
5573 /*
5574  *	Check for a preemption point in
5575  *	the current context.
5576  *
5577  *	Called at splsched with thread locked.
5578  */
ast_t
csw_check(
	thread_t                thread,
	processor_t             processor,
	ast_t                   check_reason)
{
	processor_set_t pset = processor->processor_set;

	/* Only the processor's currently running thread may be checked here. */
	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Publish the running thread's current priority/urgency into the processor. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	/* May defer a nonurgent preemption behind a timer, or upgrade it to AST_URGENT. */
	return update_pending_nonurgent_preemption(processor, preempt);
}
5611 
5612 void
clear_pending_nonurgent_preemption(processor_t processor)5613 clear_pending_nonurgent_preemption(processor_t processor)
5614 {
5615 	if (!processor->pending_nonurgent_preemption) {
5616 		return;
5617 	}
5618 
5619 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5620 
5621 	processor->pending_nonurgent_preemption = false;
5622 	running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5623 }
5624 
/*
 * Decide how a nonurgent AST_PREEMPT should be delivered: upgrade it
 * to AST_URGENT when we interrupted userspace, or arm a per-processor
 * timer bounding how long the deferred preemption may wait while the
 * thread remains in the kernel.  Returns the (possibly modified)
 * reason bits.
 */
ast_t
update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
{
	/* Only a plain, nonurgent AST_PREEMPT is deferred; anything else disarms the timer. */
	if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
		clear_pending_nonurgent_preemption(processor);
		return reason;
	}

	if (nonurgent_preemption_timer_abs == 0) {
		/* Preemption timer not enabled */
		return reason;
	}

	if (current_thread()->state & TH_IDLE) {
		/* idle threads don't need nonurgent preemption */
		return reason;
	}

	if (processor->pending_nonurgent_preemption) {
		/* Timer is already armed, no need to do it again */
		return reason;
	}

	if (ml_did_interrupt_userspace()) {
		/*
		 * We're preempting userspace here, so we don't need
		 * to defer the preemption.  Force AST_URGENT
		 * so that we can avoid arming this timer without risking
		 * ast_taken_user deciding to spend too long in kernel
		 * space to handle other ASTs.
		 */

		return reason | AST_URGENT;
	}

	/*
	 * We've decided to do a nonurgent preemption when running in
	 * kernelspace. We defer the preemption until reaching userspace boundary
	 * to give a grace period for locks etc to be dropped and to reach
	 * a clean preemption point, so that the preempting thread doesn't
	 * always immediately hit the lock that the waking thread still holds.
	 *
	 * Arm a timer to enforce that the preemption executes within a bounded
	 * time if the thread doesn't block or return to userspace quickly.
	 */

	processor->pending_nonurgent_preemption = true;
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
	    reason);

	uint64_t now = mach_absolute_time();

	uint64_t deadline = now + nonurgent_preemption_timer_abs;

	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
	    deadline, now);

	return reason;
}
5684 
5685 /*
5686  * Check for preemption at splsched with
5687  * pset locked and processor as the current
5688  * processor.
5689  */
ast_t
csw_check_locked(
	thread_t                thread,
	processor_t             processor,
	processor_set_t         pset,
	ast_t                   check_reason)
{
	assert(processor == current_processor());
	/*
	 * If the current thread is running on a processor that is no longer recommended,
	 * urgently preempt it, at which point thread_select() should
	 * try to idle the processor and re-dispatch the thread to a recommended processor.
	 */
	if (!processor->is_recommended) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/* An RT thread is waiting to spill onto this CPU: yield it urgently. */
	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/*
	 * Pending realtime work: urgent preemption when the queued RT work
	 * outranks the running thread (higher priority, exhausted quantum, or
	 * earlier deadline beyond the epsilon fuzz); otherwise nonurgent.
	 */
	if (rt_runq_count(pset) > 0) {
		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (rt_deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else {
			return check_reason | AST_PREEMPT;
		}
	}

	/* Ask the scheduler policy whether its run queues warrant a context switch. */
	ast_t result = SCHED(processor_csw_check)(processor);
	if (result != AST_NONE) {
		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
	}

	/*
	 * Same for avoid-processor
	 *
	 * TODO: Should these set AST_REBALANCE?
	 */
	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SMT
	/*
	 * Even though we could continue executing on this processor, a
	 * secondary SMT core should try to shed load to another primary core.
	 *
	 * TODO: Should this do the same check that thread_select does? i.e.
	 * if no bound threads target this processor, and idle primaries exist, preempt
	 * The case of RT threads existing is already taken care of above
	 */

	if (processor->current_pri < BASEPRI_RTQUEUES &&
	    processor->processor_primary != processor) {
		return check_reason | AST_PREEMPT;
	}
#endif /* CONFIG_SCHED_SMT*/

	/* A suspended thread should get off-core so the suspension can take effect. */
	if (thread->state & TH_SUSP) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SFI
	/*
	 * Current thread may not need to be preempted, but maybe needs
	 * an SFI wait?
	 */
	result = sfi_thread_needs_ast(thread, NULL);
	if (result != AST_NONE) {
		return result;
	}
#endif

	return AST_NONE;
}
5768 
5769 /*
5770  * Handle preemption IPI or IPI in response to setting an AST flag
5771  * Triggered by cause_ast_check
5772  * Called at splsched
5773  */
void
ast_check(processor_t processor)
{
	/* Acknowledge any pending SMR cross-CPU IPI first. */
	smr_ack_ipi();

	/* Nothing to re-evaluate unless this CPU is actively running a thread. */
	if (processor->state != PROCESSOR_RUNNING) {
		return;
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	/* Re-run the preemption check and latch any resulting AST on this CPU. */
	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
}
5843 
5844 
/*
 * Running-timer callback: the grace period granted to a deferred
 * (nonurgent) preemption has expired without the thread blocking or
 * returning to userspace, so force the preemption decision now.
 */
void
thread_preempt_expire(
	timer_call_param_t      p0,
	__unused timer_call_param_t      p1)
{
	processor_t processor = p0;

	assert(processor == current_processor());
	assert(p1 == NULL);

	thread_t thread = current_thread();

	/*
	 * This is set and cleared by the current core, so we will
	 * never see a race with running timer expiration
	 */
	assert(processor->pending_nonurgent_preemption);

	clear_pending_nonurgent_preemption(processor);

	thread_lock(thread);

	/*
	 * Check again to see if it's still worth a
	 * context switch, but this time force enable kernel preemption
	 */

	ast_t preempt = csw_check(thread, processor, AST_URGENT);

	if (preempt) {
		ast_on(preempt);
	}

	thread_unlock(thread);

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
}
5882 
5883 void
perfcontrol_timer_expire(timer_call_param_t p0,__unused timer_call_param_t p1)5884 perfcontrol_timer_expire(
5885 	timer_call_param_t          p0,
5886 	__unused timer_call_param_t p1
5887 	)
5888 {
5889 	processor_t processor = p0;
5890 	uint64_t now = mach_absolute_time();
5891 	/* Default behavior is to cancel the timer */
5892 	uint64_t timeout_ticks = EndOfAllTime;
5893 	machine_perfcontrol_running_timer_expire(now, 0, processor->cpu_id, &timeout_ticks);
5894 	if (timeout_ticks == EndOfAllTime) {
5895 		running_timer_clear(processor, RUNNING_TIMER_PERFCONTROL);
5896 	} else {
5897 		uint64_t deadline = now + timeout_ticks;
5898 		running_timer_setup(processor, RUNNING_TIMER_PERFCONTROL, NULL, deadline, now);
5899 	}
5900 }
5901 
5902 /*
5903  *	set_sched_pri:
5904  *
5905  *	Set the scheduled priority of the specified thread.
5906  *
5907  *	This may cause the thread to change queues.
5908  *
5909  *	Thread must be locked.
5910  */
void
set_sched_pri(
	thread_t        thread,
	int16_t         new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	/* A runnable non-current thread must be pulled off its runq before reprioritizing. */
	if (is_current_thread) {
		assert(thread->state & TH_RUN);
		thread_assert_runq_null(thread);
	} else {
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy, it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 *
	 * If the thread's bucket has changed, this will consume sched_tick_delta()
	 * in order to account CPU time with the correct scheduling bucket.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Requeue at the new priority and let the scheduler re-evaluate preemption. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/* Runnable and running on some other core: poke it to re-check preemption. */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
6025 
6026 /*
6027  * thread_run_queue_remove_for_handoff
6028  *
6029  * Pull a thread or its (recursive) push target out of the runqueue
6030  * so that it is ready for thread_run()
6031  *
6032  * Called at splsched
6033  *
6034  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6035  * This may be different than the thread that was passed in.
6036  */
thread_t
thread_run_queue_remove_for_handoff(thread_t thread)
{
	thread_t pulled_thread = THREAD_NULL;

	thread_lock(thread);

	/*
	 * Check that the thread is not bound to a different processor,
	 * NO_SMT flag is not set on the thread, cluster type of
	 * processor matches with thread if the thread is pinned to a
	 * particular cluster and that realtime is not involved.
	 *
	 * Next, pull it off its run queue.  If it doesn't come, it's not eligible.
	 */
	processor_t processor = current_processor();
	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
#if CONFIG_SCHED_SMT
	    && (!thread_no_smt(thread))
#endif /* CONFIG_SCHED_SMT */
	    && (processor->current_pri < BASEPRI_RTQUEUES)
	    && (thread->sched_pri < BASEPRI_RTQUEUES)
#if __AMP__
	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
	    ) {
		/* Eligible: the pull only succeeds if the thread is still enqueued. */
		if (thread_run_queue_remove(thread)) {
			pulled_thread = thread;
		}
	}

	thread_unlock(thread);

	return pulled_thread;
}
6073 
6074 /*
6075  * thread_prepare_for_handoff
6076  *
6077  * Make the thread ready for handoff.
6078  * If the thread was runnable then pull it off the runq, if the thread could
6079  * not be pulled, return NULL.
6080  *
6081  * If the thread was woken up from wait for handoff, make sure it is not bound to
6082  * different processor.
6083  *
6084  * Called at splsched
6085  *
6086  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6087  * This may be different than the thread that was passed in.
6088  */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/* Thread was woken for handoff and is not yet on any run queue. */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
#if CONFIG_SCHED_SMT
		    && (!thread_no_smt(thread))
#endif /* CONFIG_SCHED_SMT */
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			/* Ineligible for handoff here: dispatch it normally instead. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Thread may be on a run queue: try to pull it for the handoff. */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6125 
6126 /*
6127  *	thread_run_queue_remove:
6128  *
6129  *	Remove a thread from its current run queue and
6130  *	return TRUE if successful.
6131  *
6132  *	Thread must be locked.
6133  *
6134  *	If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6135  *	run queues because the caller locked the thread.  Otherwise
6136  *	the thread is on a run queue, but could be chosen for dispatch
6137  *	and removed by another processor under a different lock, which
6138  *	will set thread->runq to PROCESSOR_NULL.
6139  *
6140  *	Hence the thread select path must not rely on anything that could
6141  *	be changed under the thread lock after calling this function,
6142  *	most importantly thread->sched_pri.
6143  */
boolean_t
thread_run_queue_remove(
	thread_t        thread)
{
	boolean_t removed = FALSE;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		thread_assert_runq_null(thread);
		return FALSE;
	}

	processor_t processor = thread_get_runq(thread);
	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	/* Non-realtime threads are removed through the scheduler policy's runq. */
	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime threads live on the pset's RT run queue, guarded by the pset lock. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/*
	 * Must re-read the thread runq after acquiring the pset lock, in
	 * case another core swooped in before us to dequeue the thread.
	 */
	if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
		/*
		 *	Thread is on the RT run queue and we have a lock on
		 *	that run queue.
		 */
		rt_runq_remove(&pset->rt_runq, thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6198 
6199 /*
6200  * Put the thread back where it goes after a thread_run_queue_remove
6201  *
6202  * Thread must have been removed under the same thread lock hold
6203  *
6204  * thread locked, at splsched
6205  */
6206 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6207 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6208 {
6209 	thread_assert_runq_null(thread);
6210 	assert(thread->state & (TH_RUN));
6211 
6212 	thread_setrun(thread, options);
6213 }
6214 
6215 void
sys_override_cpu_throttle(boolean_t enable_override)6216 sys_override_cpu_throttle(boolean_t enable_override)
6217 {
6218 	if (enable_override) {
6219 		cpu_throttle_enabled = 0;
6220 	} else {
6221 		cpu_throttle_enabled = 1;
6222 	}
6223 }
6224 
/*
 * Classify a thread's urgency for the performance-control subsystem,
 * optionally returning two class-specific parameters through
 * arg1/arg2 (RT period/deadline, priorities, or throughput QoS).
 */
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task, or the idle thread: nothing to report. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime: report the thread's RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * Background urgency applied when thread priority is
		 * MAXPRI_THROTTLE or lower and thread is not promoted
		 * and thread has a QoS specified
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Output parameters are optional. */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6286 
/*
 * Map a thread to a CLPC performance-control class: idle and realtime
 * are special-cased, everything else is bucketed by base priority.
 */
perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)
{
	/* Special case handling */
	if (thread->state & TH_IDLE) {
		return PERFCONTROL_CLASS_IDLE;
	}

	if (thread->sched_mode == TH_MODE_REALTIME) {
		return PERFCONTROL_CLASS_REALTIME;
	}

	/* perfcontrol_class based on base_pri; thresholds checked in ascending order. */
	if (thread->base_pri <= MAXPRI_THROTTLE) {
		return PERFCONTROL_CLASS_BACKGROUND;
	} else if (thread->base_pri <= BASEPRI_UTILITY) {
		return PERFCONTROL_CLASS_UTILITY;
	} else if (thread->base_pri <= BASEPRI_DEFAULT) {
		return PERFCONTROL_CLASS_NONUI;
	} else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
		return PERFCONTROL_CLASS_USER_INITIATED;
	} else if (thread->base_pri <= BASEPRI_FOREGROUND) {
		return PERFCONTROL_CLASS_UI;
	} else {
		if (get_threadtask(thread) == kernel_task) {
			/*
			 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
			 * All other lower priority kernel threads should be treated
			 * as regular threads for performance control purposes.
			 */
			return PERFCONTROL_CLASS_KERNEL;
		}
		return PERFCONTROL_CLASS_ABOVEUI;
	}
}
6322 
/*
 *	This is the processor idle loop, which just looks for other threads
 *	to execute.  Processor idle threads invoke this without supplying a
 *	current thread to idle without an asserted wait state.
 *
 *	Returns the next thread to execute if dispatched directly.
 */
6330 
6331 #if 0
6332 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6333 #else
6334 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6335 #endif
6336 
6337 #if (DEVELOPMENT || DEBUG)
6338 int sched_idle_delay_cpuid = -1;
6339 #endif
6340 
thread_t
processor_idle(
	thread_t                        thread,
	processor_t                     processor)
{
	processor_set_t         pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	/* Raise to scheduler interrupt level for the duration of the loop. */
	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Account the transition into idle with the recount subsystem. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Exit conditions: no longer idle, or work pending for this CPU. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (
			processor->is_recommended
#if CONFIG_SCHED_SMT
			&& (processor->processor_primary == processor)
#endif /* CONFIG_SCHED_SMT */
			) {
			/* Recommended (primary) processors also wake for realtime work. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Derecommended/secondary processors wake only for bound threads. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Optional test knob: artificially delay idle exit on one CPU. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Pairs with the IPI sender's store ordering, as above. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
#if CONFIG_SCHED_SMT
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
#else /* CONFIG_SCHED_SMT*/
			break;
#endif /* CONFIG_SCHED_SMT*/
		}
	}

	/* Leaving idle: account the idle period and rejoin SMR. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6475 
6476 /*
6477  *	Each processor has a dedicated thread which
6478  *	executes the idle loop when there is no suitable
6479  *	previous context.
6480  *
6481  *	This continuation is entered with interrupts disabled.
6482  */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	/* This CPU is going idle; notify SMR it is leaving active use. */
	smr_cpu_leave(processor, processor->last_dispatch);

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/* Switch directly to the thread processor_idle() selected. */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing dispatched: block, re-entering this continuation on wakeup. */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
6520 
/*
 *	idle_thread_create:
 *
 *	Create the dedicated idle thread for a processor and bind it to
 *	that processor at IDLEPRI, using the supplied continuation.
 *	Panics if the kernel thread cannot be created.
 */
void
idle_thread_create(
	processor_t             processor,
	thread_continue_t       continuation)
{
	kern_return_t   result;
	thread_t                thread;
	spl_t                   s;
	char                    name[MAXTHREADNAMESIZE];

	result = kernel_thread_create(continuation, NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("idle_thread_create failed: %d", result);
	}

	/* Name the thread after its CPU, e.g. "idle #3". */
	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	thread_set_thread_name(thread, name);

	s = splsched();
	thread_lock(thread);
	thread->bound_processor = processor;
	thread->chosen_processor = processor;
	processor->idle_thread = thread;
	/* Idle threads run at the lowest priority, flagged TH_IDLE. */
	thread->sched_pri = thread->base_pri = IDLEPRI;
	thread->state = (TH_RUN | TH_IDLE);
	thread->options |= TH_OPT_IDLE_THREAD;
	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
	thread_unlock(thread);
	splx(s);

	/* Drop the reference from kernel_thread_create(). */
	thread_deallocate(thread);
}
6553 
6554 /*
6555  * sched_startup:
6556  *
6557  * Kicks off scheduler services.
6558  *
6559  * Called at splsched.
6560  */
void
sched_startup(void)
{
	kern_return_t   result;
	thread_t                thread;

	simple_lock_init(&sched_vm_group_list_lock, 0);

	/* Spawn the scheduler maintenance thread at maximum kernel priority. */
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("sched_startup");
	}

	thread_deallocate(thread);

	assert_thread_magic(thread);

	/*
	 * Yield to the sched_init_thread once, to
	 * initialize our own thread after being switched
	 * back to.
	 *
	 * The current thread is the only other thread
	 * active at this point.
	 */
	thread_block(THREAD_CONTINUE_NULL);

	assert_thread_magic(thread);
}
6591 
6592 #if __arm64__
6593 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6594 #endif /* __arm64__ */
6595 
6596 
6597 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6598 
6599 static _Atomic uint64_t                 sched_maintenance_deadline;
6600 /* Exclusively read/written by sched_timeshare_maintenance_continue */
6601 static uint64_t                         sched_tick_last_abstime;
6602 
6603 
/*
 *	sched_timeshare_maintenance_continue:
 *
 *	Perform periodic bookkeeping functions about ten
 *	times per second.
 */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t        sched_tick_ctime, late_time, sched_tick_delta;

	/* Latency tracking: earliest make-runnable timestamp seen per class. */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First invocation: establish the baseline timestamp. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	os_atomic_add(&sched_tick, (uint32_t)sched_tick_delta, relaxed);

	update_vm_info();

	/*
	 *  Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 *  Scan the run queues for threads which
	 *  may need to be updated, and find the earliest runnable thread on the runqueue
	 *  to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	/* rt_runq_scan also records pset bitmasks. */
	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/* Convert earliest make-runnable timestamps into worst-case latencies. */
	uint64_t bg_max_latency       = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency  = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until sched_timeshare_consider_maintenance() wakes us again. */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6715 
6716 static uint64_t sched_maintenance_wakeups;
6717 
6718 /*
6719  * Determine if the set of routines formerly driven by a maintenance timer
6720  * must be invoked, based on a deadline comparison. Signals the scheduler
6721  * maintenance thread on deadline expiration. Must be invoked at an interval
6722  * lower than the "sched_tick_interval", currently accomplished by
6723  * invocation via the quantum expiration timer and at context switch time.
6724  * Performance matters: this routine reuses a timestamp approximating the
6725  * current absolute time received from the caller, and should perform
6726  * no more than a comparison against the deadline in the common case.
6727  */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	uint64_t deadline = os_atomic_load(&sched_maintenance_deadline, relaxed);

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread must not try to wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/* Only the CPU that wins the CAS re-arms the deadline and
		 * delivers the wakeup; losers simply see the new deadline. */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 so only one CPU recomputes load; re-arm afterwards. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
6778 
6779 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6780 
/*
 *	sched_init_thread:
 *
 *	Body of the scheduler maintenance thread: blocks once during startup,
 *	names itself, publishes its identity, then enters the scheduler's
 *	maintenance continuation and never returns.
 */
void
sched_init_thread(void)
{
	/* Block once before running maintenance (see sched_startup()). */
	thread_block(THREAD_CONTINUE_NULL);

	thread_t thread = current_thread();

	thread_set_thread_name(thread, "sched_maintenance_thread");

	/* Publish our identity so sched_timeshare_consider_maintenance()
	 * can avoid waking itself. */
	sched_maintenance_thread = thread;

	SCHED(maintenance_continuation)();

	/*NOTREACHED*/
}
6796 
6797 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6798 
6799 /*
6800  *	thread_update_scan / runq_scan:
6801  *
6802  *	Scan the run queues to account for timesharing threads
6803  *	which need to be updated.
6804  *
6805  *	Scanner runs in two passes.  Pass one squirrels likely
6806  *	threads away in an array, pass two does the update.
6807  *
6808  *	This is necessary because the run queue is locked for
6809  *	the candidate scan, but	the thread is locked for the update.
6810  *
6811  *	Array should be sized to make forward progress, without
6812  *	disabling preemption for long periods.
6813  */
6814 
6815 #define THREAD_UPDATE_SIZE              128
6816 
6817 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
6818 static uint32_t thread_update_count = 0;
6819 
6820 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
6821 boolean_t
thread_update_add_thread(thread_t thread)6822 thread_update_add_thread(thread_t thread)
6823 {
6824 	if (thread_update_count == THREAD_UPDATE_SIZE) {
6825 		return FALSE;
6826 	}
6827 
6828 	thread_update_array[thread_update_count++] = thread;
6829 	thread_reference(thread);
6830 	return TRUE;
6831 }
6832 
6833 /* Returns whether the kernel should report that a thread triggered the fail-safe. */
6834 static bool
thread_should_report_failsafe(thread_t thread)6835 thread_should_report_failsafe(thread_t thread)
6836 {
6837 	if ((thread->sched_flags & TH_SFLAG_FAILSAFE) && !(thread->sched_flags & TH_SFLAG_FAILSAFE_REPORTED)) {
6838 		/* disarm the trigger for subsequent invocations */
6839 		thread->sched_flags |= TH_SFLAG_FAILSAFE_REPORTED;
6840 		return true;
6841 	}
6842 	return false;
6843 }
6844 
/*
 *	thread_update_process_threads:
 *
 *	Second pass of the thread update scan: for each thread stashed by
 *	thread_update_add_thread(), recompute its priority if stale, report
 *	any fail-safe trigger, and drop the stashed reference.
 */
void
thread_update_process_threads(void)
{
	assert(thread_update_count <= THREAD_UPDATE_SIZE);

	for (uint32_t i = 0; i < thread_update_count; i++) {
		thread_t thread = thread_update_array[i];
		assert_thread_magic(thread);
		thread_update_array[i] = THREAD_NULL;

		spl_t s = splsched();
		thread_lock(thread);

		/* Capture fail-safe state under the thread lock; the report
		 * itself is deferred until interrupts are re-enabled. */
		const bool should_report_failsafe = thread_should_report_failsafe(thread);
		const sched_mode_t saved_mode = thread->saved_mode; // if reporting

		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) {
			SCHED(update_priority)(thread);
		}
		thread_unlock(thread);
		splx(s);

		/* now that interrupts are enabled, it is safe to report fail-safe triggers */
		if (should_report_failsafe) {
			assert((saved_mode & TH_MODE_REALTIME) || (saved_mode & TH_MODE_FIXED));
			uint64_t th_id = thread->thread_id;
			char th_name[MAXTHREADNAMESIZE] = "unknown";
			if (thread_has_thread_name(thread)) {
				thread_get_thread_name(thread, th_name);
			}
			task_t task = get_threadtask(thread);
			assert(task != NULL);
			const char* t_name = task_best_name(task);
			pid_t t_pid = task_pid(task);
			const int quanta = (saved_mode & TH_MODE_REALTIME) ? max_unsafe_rt_quanta : max_unsafe_fixed_quanta;
			const char* mode = (saved_mode & TH_MODE_REALTIME) ? "realtime" : "fixed";
			os_log_error(OS_LOG_DEFAULT, "scheduler: thread %s [%llx] in "
			    "process %s [%d] triggered fail-safe by spinning for at least %d"
			    "us at %s priority\n",
			    th_name,
			    th_id,
			    t_name,
			    t_pid,
			    quanta * (int) sched_get_quantum_us(),
			    mode);
		}

		/* Drop the reference taken by thread_update_add_thread(). */
		thread_deallocate(thread);
	}

	thread_update_count = 0;
}
6897 
/*
 *	runq_scan_thread:
 *
 *	Examine one runnable thread during a run-queue scan: queue stale
 *	timeshare threads for a priority update, and track the earliest
 *	make-runnable time in the appropriate (bg vs. normal) bucket.
 *
 *	Returns TRUE if the thread_update_array filled up and the scan
 *	must be retried after draining.
 */
static boolean_t
runq_scan_thread(
	thread_t thread,
	sched_update_scan_context_t scan_context)
{
	assert_thread_magic(thread);

	/* Stale timeshare threads get queued for an update pass. */
	if (thread->sched_stamp != os_atomic_load(&sched_tick, relaxed) &&
	    thread->sched_mode == TH_MODE_TIMESHARE) {
		if (thread_update_add_thread(thread) == FALSE) {
			/* Batch array is full; caller must drain and rescan. */
			return TRUE;
		}
	}

	/* Threads throttled in both sched_pri and base_pri count as background. */
	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
		if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
			scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
		}
	} else {
		if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
			scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
		}
	}

	return FALSE;
}
6924 
6925 /*
6926  *	Scan a runq for candidate threads.
6927  *
6928  *	Returns TRUE if retry is needed.
6929  */
boolean_t
runq_scan(
	run_queue_t                   runq,
	sched_update_scan_context_t   scan_context)
{
	int count       = runq->count;
	int queue_index;

	assert(count >= 0);

	if (count == 0) {
		return FALSE;
	}

	/* Walk only the priority levels whose bitmap bit is set. */
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
		thread_t thread;
		circle_queue_t queue = &runq->queues[queue_index];

		cqe_foreach_element(thread, queue, runq_links) {
			assert(count > 0);
			if (runq_scan_thread(thread, scan_context) == TRUE) {
				/* Update batch overflowed; caller must retry. */
				return TRUE;
			}
			count--;
		}
	}

	return FALSE;
}
6961 
6962 #if CONFIG_SCHED_CLUTCH
6963 
/*
 *	sched_clutch_timeshare_scan:
 *
 *	Clutch-scheduler variant of the run-queue scan: walk a list of
 *	timeshare threads, queueing stale ones for a priority update.
 *
 *	Returns TRUE if the scan must be retried after draining the
 *	thread_update_array.
 */
boolean_t
sched_clutch_timeshare_scan(
	queue_t thread_queue,
	uint16_t thread_count,
	sched_update_scan_context_t scan_context)
{
	if (thread_count == 0) {
		return FALSE;
	}

	thread_t thread;
	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
		if (runq_scan_thread(thread, scan_context) == TRUE) {
			return TRUE;
		}
		thread_count--;
	}

	/* The list length must match the caller-supplied count. */
	assert(thread_count == 0);
	return FALSE;
}
6985 
6986 
6987 #endif /* CONFIG_SCHED_CLUTCH */
6988 
6989 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6990 
6991 bool
thread_is_eager_preempt(thread_t thread)6992 thread_is_eager_preempt(thread_t thread)
6993 {
6994 	return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
6995 }
6996 
/*
 *	thread_set_eager_preempt:
 *
 *	Mark a thread for eager preemption and immediately re-evaluate
 *	preemption on whichever processor is running it (if any).
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		thread_unlock(thread);

		if (ast != AST_NONE) {
			/* A context switch is now warranted; yield right away. */
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/* If the thread is currently on-core elsewhere, poke that
		 * processor so it re-evaluates preemption. */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
7030 
/*
 *	thread_clear_eager_preempt:
 *
 *	Clear the eager-preemption flag on a thread, and the cached
 *	per-processor copy if the thread is the current one.
 */
void
thread_clear_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(thread_is_eager_preempt(thread));

	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* Keep the processor's cached flag in sync. */
		current_processor()->current_is_eagerpreempt = false;
	}

	thread_unlock(thread);
	splx(s);
}
7048 
7049 /*
7050  * Scheduling statistics
7051  */
7052 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7053 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7054 {
7055 	struct sched_statistics *stats;
7056 	boolean_t to_realtime = FALSE;
7057 
7058 	stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7059 	stats->csw_count++;
7060 
7061 	if (otherpri >= BASEPRI_REALTIME) {
7062 		stats->rt_sched_count++;
7063 		to_realtime = TRUE;
7064 	}
7065 
7066 	if ((reasons & AST_PREEMPT) != 0) {
7067 		stats->preempt_count++;
7068 
7069 		if (selfpri >= BASEPRI_REALTIME) {
7070 			stats->preempted_rt_count++;
7071 		}
7072 
7073 		if (to_realtime) {
7074 			stats->preempted_by_rt_count++;
7075 		}
7076 	}
7077 }
7078 
7079 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7080 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7081 {
7082 	uint64_t timestamp = mach_absolute_time();
7083 
7084 	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7085 	stats->last_change_timestamp = timestamp;
7086 }
7087 
7088 /*
7089  *     For calls from assembly code
7090  */
#undef thread_wakeup
void
thread_wakeup(
	event_t         x);

/*
 * Out-of-line thread_wakeup for callers (e.g. assembly code) that cannot
 * use the thread_wakeup macro; uses the default THREAD_AWAKENED result.
 */
void
thread_wakeup(
	event_t         x)
{
	thread_wakeup_with_result(x, THREAD_AWAKENED);
}
7102 
7103 boolean_t
preemption_enabled(void)7104 preemption_enabled(void)
7105 {
7106 	return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7107 }
7108 
/*
 * Convert the default timer-deadline tracking bin thresholds from
 * nanoseconds into mach absolute time units.
 */
static void
sched_timer_deadline_tracking_init(void)
{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
7115 
7116 /*
7117  * Check that all CPUs are successfully powered up in places where that's expected.
7118  */
static void
check_all_cpus_are_done_starting(processor_start_kind_t start_kind)
{
	/*
	 * `processor_count` may include registered CPUs above cpus= or cpumask= limit.
	 * Use machine_info.logical_cpu_max for the CPU IDs that matter.
	 */
	for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) {
		processor_t processor = processor_array[cpu_id];
		/* Wait for this processor to finish the given kind of start. */
		processor_wait_for_start(processor, start_kind);
	}
}
7131 
7132 /*
7133  * Find some available online CPU that threads can be enqueued on
7134  *
7135  * Called with the sched_available_cores_lock held
7136  */
static int
sched_last_resort_cpu(void)
{
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);

	/* Pick the lowest-numbered CPU in the effective online mask. */
	int last_resort_cpu = lsb_first(pcs.pcs_effective.pcs_online_cores);

	if (last_resort_cpu == -1) {
		/* At least one core must always remain online. */
		panic("no last resort cpu found!");
	}

	return last_resort_cpu;
}
7150 
7151 
7152 static void
assert_no_processors_in_transition_locked()7153 assert_no_processors_in_transition_locked()
7154 {
7155 	assert(pcs.pcs_in_kernel_sleep == false);
7156 
7157 	/* All processors must be either running or offline */
7158 	assert(pcs.pcs_managed_cores ==
7159 	    (processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING] |
7160 	    processor_offline_state_map[PROCESSOR_OFFLINE_FULLY_OFFLINE]));
7161 
7162 	/* All state transitions must be quiesced at this point */
7163 	assert(pcs.pcs_effective.pcs_online_cores ==
7164 	    processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING]);
7165 }
7166 
7167 static struct powered_cores_state
sched_compute_requested_powered_cores()7168 sched_compute_requested_powered_cores()
7169 {
7170 	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);
7171 
7172 	struct powered_cores_state output = {
7173 		.pcs_online_cores = pcs.pcs_managed_cores,
7174 		.pcs_powerdown_recommended_cores = pcs.pcs_managed_cores,
7175 		.pcs_tempdown_cores = 0,
7176 	};
7177 
7178 	if (!pcs.pcs_init_completed) {
7179 		return output;
7180 	}
7181 
7182 	/*
7183 	 * if we unify this with derecommendation, note that only sleep should stop derecommendation,
7184 	 * not dtrace et al
7185 	 */
7186 	if (pcs.pcs_powerdown_suspend_count) {
7187 		return output;
7188 	} else {
7189 		/*
7190 		 * The cores power clients like ANE require or
7191 		 * the kernel cannot offline
7192 		 */
7193 		cpumap_t system_required_powered_cores = pcs.pcs_required_online_pmgr |
7194 		    pcs.pcs_required_online_system;
7195 
7196 		cpumap_t online_cores_goal;
7197 
7198 		if (pcs.pcs_user_online_core_control) {
7199 			/* This is our new goal state for powered cores */
7200 			output.pcs_powerdown_recommended_cores = pcs.pcs_requested_online_user;
7201 			online_cores_goal = pcs.pcs_requested_online_user | system_required_powered_cores;
7202 		} else {
7203 			/* Remove the cores CLPC wants to power down */
7204 			cpumap_t clpc_wanted_powered_cores = pcs.pcs_managed_cores;
7205 			clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_user;
7206 			clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_system;
7207 
7208 			output.pcs_powerdown_recommended_cores = clpc_wanted_powered_cores;
7209 			online_cores_goal = clpc_wanted_powered_cores | system_required_powered_cores;
7210 
7211 			/* Any cores in managed cores that are not in wanted powered become temporary */
7212 			output.pcs_tempdown_cores = (pcs.pcs_managed_cores & ~clpc_wanted_powered_cores);
7213 
7214 			/* Future: Treat CLPC user/system separately. */
7215 		}
7216 
7217 		if (online_cores_goal == 0) {
7218 			/*
7219 			 * If we're somehow trying to disable all CPUs,
7220 			 * force online the lowest numbered CPU.
7221 			 */
7222 			online_cores_goal = BIT(lsb_first(pcs.pcs_managed_cores));
7223 		}
7224 
7225 #if RHODES_CLUSTER_POWERDOWN_WORKAROUND
7226 		/*
7227 		 * Because warm CPU boot from WFI is not currently implemented,
7228 		 * we cannot power down only one CPU in a cluster, so we force up
7229 		 * all the CPUs in the cluster if any one CPU is up in the cluster.
7230 		 * Once all CPUs are disabled, then the whole cluster goes down at once.
7231 		 */
7232 
7233 		cpumap_t workaround_online_cores = 0;
7234 
7235 		const ml_topology_info_t* topology = ml_get_topology_info();
7236 		for (unsigned int i = 0; i < topology->num_clusters; i++) {
7237 			ml_topology_cluster_t* cluster = &topology->clusters[i];
7238 			if ((cluster->cpu_mask & online_cores_goal) != 0) {
7239 				workaround_online_cores |= cluster->cpu_mask;
7240 			}
7241 		}
7242 
7243 		online_cores_goal = workaround_online_cores;
7244 #endif /* RHODES_CLUSTER_POWERDOWN_WORKAROUND */
7245 
7246 		output.pcs_online_cores = online_cores_goal;
7247 	}
7248 
7249 	return output;
7250 }
7251 
7252 static bool
sched_needs_update_requested_powered_cores()7253 sched_needs_update_requested_powered_cores()
7254 {
7255 	if (!pcs.pcs_init_completed) {
7256 		return false;
7257 	}
7258 
7259 	struct powered_cores_state requested = sched_compute_requested_powered_cores();
7260 
7261 	struct powered_cores_state effective = pcs.pcs_effective;
7262 
7263 	if (requested.pcs_powerdown_recommended_cores != effective.pcs_powerdown_recommended_cores ||
7264 	    requested.pcs_online_cores != effective.pcs_online_cores ||
7265 	    requested.pcs_tempdown_cores != effective.pcs_tempdown_cores) {
7266 		return true;
7267 	} else {
7268 		return false;
7269 	}
7270 }
7271 
/*
 * Power down one processor on behalf of a userspace request.
 *
 * Preconditions: cluster_powerdown_lock held, preemption enabled.
 * Takes sched_available_cores_lock at splsched to update the request state.
 *
 * Returns:
 *   KERN_NOT_SUPPORTED     - processor_exit disabled, or this CPU may never
 *                            go offline outside of system sleep.
 *   KERN_FAILURE           - PMGR requires the CPU online right now, or a
 *                            powerdown-suspending tool is active.
 *   KERN_NODE_DOWN         - userspace already powered this CPU off.
 *   KERN_RESOURCE_SHORTAGE - this is the last recommended online core.
 *   KERN_SUCCESS           - request recorded (and applied if it changed
 *                            the computed powered-cores state).
 */
kern_return_t
sched_processor_exit_user(processor_t processor)
{
	assert(processor);

	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	assert(preemption_enabled());

	kern_return_t result;
	/* Threads pulled off a disabled core are re-run after the locks drop. */
	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (!enable_processor_exit) {
		/* This API is not supported on this device. */
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) {
		/* This CPU can never change state outside of sleep. */
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	/*
	 * Future: Instead of failing, simulate the processor
	 * being shut down via derecommendation and decrementing active count.
	 */
	if (bit_test(pcs.pcs_required_online_pmgr, processor->cpu_id)) {
		/* PMGR won't let us power down this CPU right now. */
		result = KERN_FAILURE;
		goto unlock;
	}

	if (pcs.pcs_powerdown_suspend_count) {
		/* A tool that disables CPU powerdown is active. */
		result = KERN_FAILURE;
		goto unlock;
	}

	if (!bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) {
		/* The CPU is already powered off by userspace. */
		result = KERN_NODE_DOWN;
		goto unlock;
	}

	if ((pcs.pcs_recommended_cores & pcs.pcs_effective.pcs_online_cores) == BIT(processor->cpu_id)) {
		/* This is the last available core, can't shut it down. */
		result = KERN_RESOURCE_SHORTAGE;
		goto unlock;
	}

	result = KERN_SUCCESS;

	/* Userspace is now explicitly managing online cores. */
	if (!pcs.pcs_user_online_core_control) {
		pcs.pcs_user_online_core_control = true;
	}

	bit_clear(pcs.pcs_requested_online_user, processor->cpu_id);

	if (sched_needs_update_requested_powered_cores()) {
		/* Drops and re-takes sched_available_cores_lock. */
		threadq = sched_update_powered_cores_drops_lock(REASON_USER, s, threadq);
	}

unlock:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	return result;
}
7346 
/*
 * Power a processor back on on behalf of a userspace request.
 * Mirror of sched_processor_exit_user().
 *
 * Preconditions: cluster_powerdown_lock held, preemption enabled.
 *
 * Returns:
 *   KERN_NOT_SUPPORTED - processor_exit disabled, or CPU state is fixed
 *                        outside of sleep.
 *   KERN_FAILURE       - SMT disabled and this is a secondary SMT CPU,
 *                        powerdown is suspended, or the CPU is already
 *                        requested online.
 *   KERN_SUCCESS       - request recorded (and applied if needed).
 */
kern_return_t
sched_processor_start_user(processor_t processor)
{
	assert(processor);

	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	assert(preemption_enabled());

	kern_return_t result;
	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (!enable_processor_exit) {
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) {
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

#if CONFIG_SCHED_SMT
	/* Not allowed to start an SMT processor while SMT is disabled */
	if ((sched_enable_smt == 0) && (processor->processor_primary != processor)) {
		result = KERN_FAILURE;
		goto unlock;
	}
#endif /* CONFIG_SCHED_SMT */

	if (pcs.pcs_powerdown_suspend_count) {
		result = KERN_FAILURE;
		goto unlock;
	}

	if (bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) {
		/* Already requested online by userspace; nothing to do. */
		result = KERN_FAILURE;
		goto unlock;
	}

	result = KERN_SUCCESS;

	bit_set(pcs.pcs_requested_online_user, processor->cpu_id);

	/*
	 * Once the user puts all CPUs back online,
	 * we can resume automatic cluster power down.
	 */
	if (pcs.pcs_requested_online_user == pcs.pcs_managed_cores) {
		pcs.pcs_user_online_core_control = false;
	}

	if (sched_needs_update_requested_powered_cores()) {
		/* Drops and re-takes sched_available_cores_lock. */
		threadq = sched_update_powered_cores_drops_lock(REASON_USER, s, threadq);
	}

unlock:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	return result;
}
7413 
/* Condition used to wake the asynchronous powered-cores update thread. */
sched_cond_atomic_t sched_update_powered_cores_wakeup;
/* Kernel thread created by sched_cpu_init_completed(). */
thread_t sched_update_powered_cores_thread;


static void OS_NORETURN sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused);
7419 
7420 /*
7421  * After all processors have been ml_processor_register'ed and processor_boot'ed
7422  * the scheduler can finalize its datastructures and allow CPU power state changes.
7423  *
7424  * Enforce that this only happens *once*. More than once is definitely not OK. rdar://121270513
7425  */
void
sched_cpu_init_completed(void)
{
	/* One-shot guard: a second call is a fatal error (rdar://121270513). */
	static bool sched_cpu_init_completed_called = false;

	if (!os_atomic_cmpxchg(&sched_cpu_init_completed_called, false, true, relaxed)) {
		panic("sched_cpu_init_completed called twice! %d", sched_cpu_init_completed_called);
	}

	/* Let the active scheduler policy finalize its own state, if it cares. */
	if (SCHED(cpu_init_completed) != NULL) {
		SCHED(cpu_init_completed)();
	}

	SCHED(rt_init_completed)();

	/* Wait for any cpu that is still starting, and enforce that they eventually complete. */
	check_all_cpus_are_done_starting(PROCESSOR_FIRST_BOOT);

	lck_mtx_lock(&cluster_powerdown_lock);

	assert(sched_update_powered_cores_thread == THREAD_NULL);

	sched_cond_init(&sched_update_powered_cores_wakeup);

	/* Spawn the thread that asynchronously applies powered-cores changes. */
	kern_return_t result = kernel_thread_start_priority(
		sched_update_powered_cores_continue,
		NULL, MAXPRI_KERNEL, &sched_update_powered_cores_thread);
	if (result != KERN_SUCCESS) {
		panic("failed to create sched_update_powered_cores thread");
	}

	thread_set_thread_name(sched_update_powered_cores_thread,
	    "sched_update_powered_cores");

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_init_completed == false);

	/* Every CPU is online at this point; that set becomes the managed set. */
	pcs.pcs_managed_cores = pcs.pcs_effective.pcs_online_cores;

	assert(__builtin_popcountll(pcs.pcs_managed_cores) == machine_info.logical_cpu_max);

	/* If CLPC tries to cluster power down before this point, it's ignored. */
	pcs.pcs_requested_online_user = pcs.pcs_managed_cores;
	pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores;
	pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores;

	cpumap_t system_required_cores = 0;

	/*
	 * Ask the platform layer which CPUs are allowed to
	 * be powered off outside of system sleep.
	 */
	for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) {
		if (!ml_cpu_can_exit(cpu_id)) {
			bit_set(system_required_cores, cpu_id);
		}
	}

	pcs.pcs_required_online_system = system_required_cores;
	pcs.pcs_effective.pcs_powerdown_recommended_cores = pcs.pcs_managed_cores;

	pcs.pcs_requested = sched_compute_requested_powered_cores();

	/* With everything forced online, requested and effective must both be the all-online state. */
	assert(pcs.pcs_requested.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_requested.pcs_online_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_requested.pcs_tempdown_cores == 0);

	assert(pcs.pcs_effective.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_effective.pcs_online_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_effective.pcs_tempdown_cores == 0);

	pcs.pcs_init_completed = true;

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	lck_mtx_unlock(&cluster_powerdown_lock);

	/* Release the +1 pcs_powerdown_suspend_count that we booted up with. */
	resume_cluster_powerdown();
}
7509 
7510 bool
sched_is_in_sleep(void)7511 sched_is_in_sleep(void)
7512 {
7513 	return pcs.pcs_in_kernel_sleep || pcs.pcs_wants_kernel_sleep;
7514 }
7515 
/* True once sched_cpu_init_completed() has finished its one-shot setup. */
bool
sched_is_cpu_init_completed(void)
{
	return pcs.pcs_init_completed;
}
7521 
7522 processor_reason_t last_sched_update_powered_cores_continue_reason;
7523 
/*
 * Body of sched_update_powered_cores_thread.
 *
 * Loops forever: under cluster_powerdown_lock + sched_available_cores_lock,
 * checks whether the effective powered-cores state is stale and, if so,
 * applies the update using the reason most recently recorded in
 * last_sched_update_powered_cores_continue_reason. Blocks on
 * sched_update_powered_cores_wakeup when there is nothing to do.
 */
static void OS_NORETURN
sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused)
{
	sched_cond_ack(&sched_update_powered_cores_wakeup);

	while (true) {
		lck_mtx_lock(&cluster_powerdown_lock);

		struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		bool needs_update = sched_needs_update_requested_powered_cores();

		if (needs_update) {
			/* This thread shouldn't need to make changes while powerdown is suspended */
			assert(pcs.pcs_powerdown_suspend_count == 0);

			processor_reason_t reason = last_sched_update_powered_cores_continue_reason;

			threadq = sched_update_powered_cores_drops_lock(reason, s, threadq);
		}

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		pulled_thread_queue_flush(threadq);

		lck_mtx_unlock(&cluster_powerdown_lock);

		/* If we did an update, we dropped the lock, so check again. */

		if (!needs_update) {
			sched_cond_wait(&sched_update_powered_cores_wakeup, THREAD_UNINT,
			    sched_update_powered_cores_continue);
			/* The condition was signaled since we last blocked, check again. */
		}
	}
}
7564 
/*
 * Option bits accepted in the flags argument of
 * sched_perfcontrol_update_powered_cores(); only acted upon on
 * DEVELOPMENT || DEBUG kernels.
 */
__options_decl(sched_powered_cores_flags_t, uint32_t, {
	ASSERT_IN_SLEEP                 = 0x10000000, /* assert pcs_sleep_override_recommended is set */
	ASSERT_POWERDOWN_SUSPENDED      = 0x20000000, /* assert pcs_powerdown_suspend_count > 0 */
	POWERED_CORES_OPTIONS_MASK      = ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED,
});
7570 
7571 /*
7572  * This is KPI with CLPC.
7573  */
/*
 * CLPC entry point: record a new requested powered-cores bitmap for the
 * given reason (REASON_CLPC_SYSTEM or REASON_CLPC_USER) and, if that
 * changes the computed state, signal the async update thread to apply it.
 *
 * The request is masked to pcs_managed_cores before being stored.
 */
void
sched_perfcontrol_update_powered_cores(
	uint64_t requested_powered_cores,
	processor_reason_t reason,
	__unused uint32_t flags)
{
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	/* Assertion-only modes: verify state and return without changing anything. */
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(pcs.pcs_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(pcs.pcs_sleep_override_recommended == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	cpumap_t requested_cores = requested_powered_cores & pcs.pcs_managed_cores;

	if (reason == REASON_CLPC_SYSTEM) {
		pcs.pcs_requested_online_clpc_system = requested_cores;
	} else if (reason == REASON_CLPC_USER) {
		pcs.pcs_requested_online_clpc_user = requested_cores;
	}

	bool needs_update = sched_needs_update_requested_powered_cores();

	if (needs_update) {
		last_sched_update_powered_cores_continue_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Signal outside the spinlock; the update thread re-checks under the lock. */
	if (needs_update) {
		sched_cond_signal(&sched_update_powered_cores_wakeup,
		    sched_update_powered_cores_thread);
	}
}
7619 
7620 /*
7621  * The performance controller invokes this method to reevaluate a thread
7622  * placement on the processor cpu_id when the per-core timer expires to force
7623  * a preemption if necessary.
7624  */
/*
 * Re-run the context-switch check for the current thread on the current
 * processor and post an AST if preemption is warranted.
 *
 * Must be called with interrupts disabled, on the processor identified by
 * cpu_id. Returns true if an AST was posted (best effort; see TODO below).
 */
bool
sched_perfcontrol_check_oncore_thread_preemption(
	__unused uint64_t flags,
	int cpu_id __assert_only)
{
	bool ret = false;
	assert(ml_get_interrupts_enabled() == false);

	processor_t processor = current_processor();
	thread_t thread = current_thread();
	assert(processor->cpu_id == cpu_id);

	thread_lock(thread);
	ast_t preempt = csw_check(thread, processor, AST_NONE);
	if (preempt != AST_NONE) {
		/*
		 * TODO: Returning true here is best effort and isn't guaranteed to preempt the thread since thread_select can
		 * choose to leave the thread on the same processor. Consider using the flags passed in here to callback into
		 * CLPC before the next scheduling decision point (or sampler tick) if this decision needs to be reevaluated or
		 * to otherwise adjust this behavior.
		 */
		ret = true;
		ast_on(preempt);
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_ONCORE_PREEMPT), thread_tid(thread), processor->cpu_id, 0, 0, 0);
	}
	thread_unlock(thread);

	return ret;
}
7654 
7655 /*
7656  * This doesn't just suspend cluster powerdown.
7657  * It also powers up all the cores and leaves them up,
7658  * even if some user wanted them down.
7659  * This is important because dtrace, monotonic, and others can't handle any
7660  * powered down cores, not just cluster powerdown.
7661  */
/*
 * Take one powerdown-suspend reference and force all managed cores online.
 * With for_sleep set, also transitions pcs_wants_kernel_sleep ->
 * pcs_in_kernel_sleep around the update.
 *
 * Caller holds cluster_powerdown_lock.
 */
static void
suspend_cluster_powerdown_locked(bool for_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	kprintf("%s>calling sched_update_powered_cores to suspend powerdown\n", __func__);

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_powerdown_suspend_count >= 0);

	if (for_sleep) {
		assert(!pcs.pcs_wants_kernel_sleep);
		assert(!pcs.pcs_in_kernel_sleep);
		pcs.pcs_wants_kernel_sleep = true;
	}

	pcs.pcs_powerdown_suspend_count++;

	if (sched_needs_update_requested_powered_cores()) {
		/* Drops and re-takes sched_available_cores_lock. */
		threadq = sched_update_powered_cores_drops_lock(REASON_SYSTEM, s, threadq);
	}

	if (for_sleep) {
		/* The update above must have brought the state fully in line. */
		assert(pcs.pcs_wants_kernel_sleep);
		assert(!pcs.pcs_in_kernel_sleep);
		pcs.pcs_in_kernel_sleep = true;

		assert(sched_needs_update_requested_powered_cores() == false);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	if (pcs.pcs_init_completed) {
		/* At this point, no cpu should be still starting. Let's enforce that. */
		check_all_cpus_are_done_starting(for_sleep ?
		    PROCESSOR_BEFORE_ENTERING_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_SUSPEND);
	}
}
7706 
/*
 * Drop one powerdown-suspend reference. When the count reaches zero,
 * the user and CLPC request state is reset to all-online and client
 * controlled powerdown resumes. With for_sleep set, also clears the
 * sleep flags around the update.
 *
 * Caller holds cluster_powerdown_lock. Panics on unbalanced resume.
 */
static void
resume_cluster_powerdown_locked(bool for_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);

	if (pcs.pcs_init_completed) {
		/* At this point, no cpu should be still starting. Let's enforce that. */
		check_all_cpus_are_done_starting(for_sleep ?
		    PROCESSOR_WAKE_FROM_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_RESUME);
	}

	kprintf("%s>calling sched_update_powered_cores to resume powerdown\n", __func__);

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (pcs.pcs_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with pcs.pcs_powerdown_suspend_count=%d\n", pcs.pcs_powerdown_suspend_count);
	}

	if (for_sleep) {
		assert(pcs.pcs_wants_kernel_sleep);
		assert(pcs.pcs_in_kernel_sleep);
		pcs.pcs_wants_kernel_sleep = false;
	}

	pcs.pcs_powerdown_suspend_count--;

	if (pcs.pcs_powerdown_suspend_count == 0) {
		/* Returning to client controlled powerdown mode */
		assert(pcs.pcs_init_completed);

		/* To match previous behavior, clear the user state */
		pcs.pcs_requested_online_user = pcs.pcs_managed_cores;
		pcs.pcs_user_online_core_control = false;

		/* To match previous behavior, clear the requested CLPC state. */
		pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores;
		pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores;
	}

	if (sched_needs_update_requested_powered_cores()) {
		/* Drops and re-takes sched_available_cores_lock. */
		threadq = sched_update_powered_cores_drops_lock(REASON_SYSTEM, s, threadq);
	}

	if (for_sleep) {
		assert(!pcs.pcs_wants_kernel_sleep);
		assert(pcs.pcs_in_kernel_sleep);
		pcs.pcs_in_kernel_sleep = false;

		assert(sched_needs_update_requested_powered_cores() == false);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);
}
7767 
7768 static uint64_t
die_and_cluster_to_cpu_mask(__unused unsigned int die_id,__unused unsigned int die_cluster_id)7769 die_and_cluster_to_cpu_mask(
7770 	__unused unsigned int die_id,
7771 	__unused unsigned int die_cluster_id)
7772 {
7773 #if __arm__ || __arm64__
7774 	const ml_topology_info_t* topology = ml_get_topology_info();
7775 	unsigned int num_clusters = topology->num_clusters;
7776 	for (unsigned int i = 0; i < num_clusters; i++) {
7777 		ml_topology_cluster_t* cluster = &topology->clusters[i];
7778 		if ((cluster->die_id == die_id) &&
7779 		    (cluster->die_cluster_id == die_cluster_id)) {
7780 			return cluster->cpu_mask;
7781 		}
7782 	}
7783 #endif
7784 	return 0ull;
7785 }
7786 
7787 /*
7788  * Take an assertion that ensures all CPUs in the cluster are powered up until
7789  * the assertion is released.
7790  * A system suspend will still power down the CPUs.
7791  * This call will stall if system suspend is in progress.
7792  *
7793  * Future ER: Could this just power up the cluster, and leave enabling the
7794  * processors to be asynchronous, or deferred?
7795  *
7796  * Enabling the rail is synchronous, it must be powered up before returning.
7797  */
/*
 * See block comment above: take the PMGR assertion keeping all CPUs of the
 * (die_id, die_cluster_id) cluster powered, applying the change
 * synchronously before returning.
 */
void
sched_enable_acc_rail(unsigned int die_id, unsigned int die_cluster_id)
{
	uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id);

	lck_mtx_lock(&cluster_powerdown_lock);

	/*
	 * Note: if pcs.pcs_init_completed is false, because the
	 * CPUs have not booted yet, then we assume that all
	 * clusters are already powered up at boot (see IOCPUInitialize)
	 * so we don't have to wait for cpu boot to complete.
	 * We'll still save the requested assertion and enforce it after
	 * boot completes.
	 */

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (pcs.pcs_init_completed) {
		/* The cluster's CPUs must all be part of the managed set. */
		assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask);
	}

	/* Can't enable something that is already enabled */
	assert((pcs.pcs_required_online_pmgr & core_mask) == 0);

	pcs.pcs_required_online_pmgr |= core_mask;

	if (sched_needs_update_requested_powered_cores()) {
		/* Drops and re-takes sched_available_cores_lock. */
		threadq = sched_update_powered_cores_drops_lock(REASON_PMGR_SYSTEM, s, threadq);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7839 
7840 /*
7841  * Release the assertion ensuring the cluster is powered up.
7842  * This operation is asynchronous, so PMGR doesn't need to wait until it takes
7843  * effect. If the enable comes in before it takes effect, it'll either
7844  * wait on the lock, or the async thread will discover it needs no update.
7845  */
/*
 * See block comment above: release the PMGR rail assertion for the cluster
 * and, if that changes the computed state, hand the (asynchronous)
 * powerdown to the update thread.
 */
void
sched_disable_acc_rail(unsigned int die_id, unsigned int die_cluster_id)
{
	uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id);

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Can't disable something that is already disabled */
	assert((pcs.pcs_required_online_pmgr & core_mask) == core_mask);

	if (pcs.pcs_init_completed) {
		assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask);
	}

	pcs.pcs_required_online_pmgr &= ~core_mask;

	bool needs_update = sched_needs_update_requested_powered_cores();

	if (needs_update) {
		last_sched_update_powered_cores_continue_reason = REASON_PMGR_SYSTEM;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Signal outside the spinlock; the update thread re-checks under the lock. */
	if (needs_update) {
		sched_cond_signal(&sched_update_powered_cores_wakeup,
		    sched_update_powered_cores_thread);
	}
}
7877 
/* Public wrapper: suspend cluster powerdown (not for sleep). */
void
suspend_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);
	suspend_cluster_powerdown_locked(false);
	lck_mtx_unlock(&cluster_powerdown_lock);
}
7885 
/*
 * Public wrapper: resume cluster powerdown (not for sleep).
 * Re-applies SMT processor disablement afterwards, since the resume path
 * resets the requested core state.
 */
void
resume_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);
	resume_cluster_powerdown_locked(false);
	lck_mtx_unlock(&cluster_powerdown_lock);

#if CONFIG_SCHED_SMT
	if (sched_enable_smt == 0) {
		enable_smt_processors(false);
	}
#endif /* CONFIG_SCHED_SMT */
}
7899 
7900 
/* Serializes the userspace suspend/resume pair and its bookkeeping flag. */
LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
/* True while userspace holds a cluster-powerdown suspension. */
static bool user_suspended_cluster_powerdown = false;
7903 
7904 kern_return_t
suspend_cluster_powerdown_from_user(void)7905 suspend_cluster_powerdown_from_user(void)
7906 {
7907 	kern_return_t ret = KERN_FAILURE;
7908 
7909 	lck_mtx_lock(&user_cluster_powerdown_lock);
7910 
7911 	if (!user_suspended_cluster_powerdown) {
7912 		suspend_cluster_powerdown();
7913 		user_suspended_cluster_powerdown = true;
7914 		ret = KERN_SUCCESS;
7915 	}
7916 
7917 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7918 
7919 	return ret;
7920 }
7921 
7922 kern_return_t
resume_cluster_powerdown_from_user(void)7923 resume_cluster_powerdown_from_user(void)
7924 {
7925 	kern_return_t ret = KERN_FAILURE;
7926 
7927 	lck_mtx_lock(&user_cluster_powerdown_lock);
7928 
7929 	if (user_suspended_cluster_powerdown) {
7930 		resume_cluster_powerdown();
7931 		user_suspended_cluster_powerdown = false;
7932 		ret = KERN_SUCCESS;
7933 	}
7934 
7935 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7936 
7937 	return ret;
7938 }
7939 
7940 int
get_cluster_powerdown_user_suspended(void)7941 get_cluster_powerdown_user_suspended(void)
7942 {
7943 	lck_mtx_lock(&user_cluster_powerdown_lock);
7944 
7945 	int ret = (int)user_suspended_cluster_powerdown;
7946 
7947 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7948 
7949 	return ret;
7950 }
7951 
7952 #if DEVELOPMENT || DEBUG
7953 /* Functions to support the temporary sysctl */
/* Last raw value passed in, returned verbatim by sched_get_powered_cores(). */
static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
/*
 * Sysctl shim: decode the packed control word and forward it to
 * sched_perfcontrol_update_powered_cores().
 *   bit 31      - selects REASON_CLPC_USER (set) vs REASON_CLPC_SYSTEM
 *   bits 29-30  - sched_powered_cores_flags_t assertion options
 *   bits 0-28   - requested powered-cores bitmap
 */
void
sched_set_powered_cores(int requested_powered_cores)
{
	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
	sched_powered_cores_flags_t flags = requested_powered_cores & POWERED_CORES_OPTIONS_MASK;

	saved_requested_powered_cores = requested_powered_cores;

	requested_powered_cores = bits(requested_powered_cores, 28, 0);

	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
}
/* Sysctl shim: return the last raw value given to sched_set_powered_cores(). */
int
sched_get_powered_cores(void)
{
	return (int)saved_requested_powered_cores;
}
7972 
/* Sysctl shim: expose the current recommended-cores bitmap. */
uint64_t
sched_sysctl_get_recommended_cores(void)
{
	return pcs.pcs_recommended_cores;
}
7978 #endif
7979 
7980 /*
7981  * Ensure that all cores are powered and recommended before sleep
7982  * Acquires cluster_powerdown_lock and returns with it held.
7983  */
void
sched_override_available_cores_for_sleep(void)
{
	if (!pcs.pcs_init_completed) {
		panic("Attempting to sleep before all CPUS are registered");
	}

	/* Held across sleep; released by sched_restore_available_cores_after_sleep(). */
	lck_mtx_lock(&cluster_powerdown_lock);

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_sleep_override_recommended == false);

	/* Force-recommend everything first, then power everything up below. */
	pcs.pcs_sleep_override_recommended = true;
	sched_update_recommended_cores_locked(REASON_SYSTEM, 0, threadq);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	suspend_cluster_powerdown_locked(true);
}
8010 
8011 /*
8012  * Restore the previously recommended cores, but leave all cores powered
8013  * after sleep.
8014  * Called with cluster_powerdown_lock still held, releases the lock.
8015  */
void
sched_restore_available_cores_after_sleep(void)
{
	/* Lock was taken by sched_override_available_cores_for_sleep(). */
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	assert(pcs.pcs_sleep_override_recommended == true);

	/* Drop the sleep override and recompute the recommendation. */
	pcs.pcs_sleep_override_recommended = false;
	sched_update_recommended_cores_locked(REASON_NONE, 0, threadq);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	resume_cluster_powerdown_locked(true);

	lck_mtx_unlock(&cluster_powerdown_lock);

#if CONFIG_SCHED_SMT
	/* The resume path reset core state; re-apply SMT disablement. */
	if (sched_enable_smt == 0) {
		enable_smt_processors(false);
	}
#endif /* CONFIG_SCHED_SMT */
}
8045 
8046 /*
8047  * Technically we could avoid passing this pointer around and instead
8048  * only look at current_processor, but having a token to show where and when
8049  * it is used enforces correctness and clarity of the preemption disabled region.
8050  *
8051  * processor_threadq_interrupt handles the case where this is called in a context
8052  * where we could have interrupted another in-flight pulled_thread_queue operation
8053  * that merely had preemption disabled, so we need to use a separate instance
8054  * of the queue in order to not conflict with it.
8055  */
struct pulled_thread_queue *
pulled_thread_queue_prepare(void)
{
	struct pulled_thread_queue *threadq;

	if (ml_get_interrupts_enabled() == false) {
		/* Interrupt context: use the dedicated queue so we can't collide
		 * with a preemption-disabled operation we may have interrupted. */
		threadq = &current_processor()->processor_threadq_interrupt;
	} else {
		/* paired with enable inside pulled_thread_queue_flush */
		disable_preemption();
		threadq = &current_processor()->processor_threadq;
	}

	/* Nesting the same queue instance is not allowed. */
	assert(threadq->ptq_queue_active == false);
	threadq->ptq_queue_active = true;

	return threadq;
}
8074 
8075 #if SCHED_HYGIENE_DEBUG
8076 extern uint32_t waitq_flush_excess_threads;
8077 extern uint32_t waitq_flush_excess_time_mt;
8078 #endif /* SCHED_HYGIENE_DEBUG */
8079 
/*
 * Drain a queue filled via pulled_thread_queue_enqueue(): make every pulled
 * thread runnable again, then perform any deferred smr_cpu_down() calls.
 * Deactivates the queue and (for the non-interrupt queue) re-enables
 * preemption, pairing with pulled_thread_queue_prepare().
 */
void
pulled_thread_queue_flush(struct pulled_thread_queue *threadq)
{
	assert(!preemption_enabled());

	bool in_interrupt;
	if (threadq == &current_processor()->processor_threadq_interrupt) {
		assert(ml_get_interrupts_enabled() == false);
		in_interrupt = true;
	} else {
		assert3p(threadq, ==, &current_processor()->processor_threadq);
		in_interrupt = false;
	}

	assert(threadq->ptq_queue_active == true);

	/* Fast path: nothing queued and no deferred SMR work. */
	if (circle_queue_empty(&threadq->ptq_threadq) && threadq->ptq_needs_smr_cpu_down == 0) {
		threadq->ptq_queue_active = false;
		if (!in_interrupt) {
			/* match the disable from pulled_thread_queue_prepare */
			enable_preemption();
		}
		return;
	}

	thread_t thread = THREAD_NULL;

	int flushed_threads = 0;

#if SCHED_HYGIENE_DEBUG
	uint64_t start_time = ml_get_sched_hygiene_timebase();
#endif /* SCHED_HYGIENE_DEBUG */

	/* Re-dispatch each pulled thread onto a runqueue. */
	cqe_foreach_element_safe(thread, &threadq->ptq_threadq, wait_links) {
		assert_thread_magic(thread);
		circle_dequeue(&threadq->ptq_threadq, &thread->wait_links);

		spl_t s = splsched();
		thread_lock(thread);

		thread_assert_runq_null(thread);
		assert(thread->state & (TH_RUN));
		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
		splx(s);

		flushed_threads++;
	}

#if SCHED_HYGIENE_DEBUG
	uint64_t end_time = ml_get_sched_hygiene_timebase();

	/*
	 * Check for a combination of excess threads and long time,
	 * so that a single thread wakeup that gets stuck is still caught
	 */
	if (waitq_flush_excess_threads && waitq_flush_excess_time_mt &&
	    flushed_threads > waitq_flush_excess_threads &&
	    (end_time - start_time) > waitq_flush_excess_time_mt) {
		/*
		 * Hack alert:
		 *
		 * If there are too many threads here, it can take Too Long
		 * to get through waking up all the threads, leading to
		 * the watchdog going off. Disable the watchdog for this case.
		 *
		 * We only trigger this when seeing a combination of
		 * excess threads and long time, so that a single
		 * thread wakeup that gets stuck is still caught.
		 *
		 * A better story is tracked under rdar://101110793
		 */
		if (ml_get_interrupts_enabled() == false) {
			ml_spin_debug_reset(current_thread());
			ml_irq_debug_abandon();
		}
		abandon_preemption_disable_measurement();

		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_RESET),
		    flushed_threads, end_time - start_time);
	}

#endif /* SCHED_HYGIENE_DEBUG */

	/* Perform the deferred smr_cpu_down() calls recorded during the update. */
	cpumap_foreach(cpu_id, threadq->ptq_needs_smr_cpu_down) {
		processor_t processor = processor_array[cpu_id];

		spl_t s = splsched();
		smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
		splx(s);
	}
	threadq->ptq_needs_smr_cpu_down = 0;

	assert(circle_queue_empty(&threadq->ptq_threadq));
	assert(threadq->ptq_queue_active == true);
	threadq->ptq_queue_active = false;

	if (!in_interrupt) {
		/* match the disable from pulled_thread_queue_prepare */
		enable_preemption();
	}
}
8183 
/*
 * Add a pulled thread to the active queue; it will be made runnable again
 * by pulled_thread_queue_flush(). Must be called on the owning processor
 * with preemption disabled and the queue active.
 */
void
pulled_thread_queue_enqueue(
	struct pulled_thread_queue *threadq,
	thread_t thread)
{
	assert(threadq == &current_processor()->processor_threadq ||
	    threadq == &current_processor()->processor_threadq_interrupt);
	assert(threadq->ptq_queue_active == true);
	assert(!preemption_enabled());

	circle_enqueue_tail(&threadq->ptq_threadq, &thread->wait_links);
}
8196 
8197 void
pulled_thread_queue_needs_smr_cpu_down(struct pulled_thread_queue * threadq,int cpu_id)8198 pulled_thread_queue_needs_smr_cpu_down(
8199 	struct pulled_thread_queue *threadq,
8200 	int cpu_id)
8201 {
8202 	assert(threadq == &current_processor()->processor_threadq ||
8203 	    threadq == &current_processor()->processor_threadq_interrupt);
8204 
8205 	assert(threadq->ptq_queue_active == true);
8206 	assert(!preemption_enabled());
8207 
8208 	bit_set(threadq->ptq_needs_smr_cpu_down, cpu_id);
8209 }
8210 
8211 
8212 #if __arm__ || __arm64__
8213 
/*
 * Recommended-cores failsafe bookkeeping.
 * These are read/written under sched_available_cores_lock (see
 * sched_consider_recommended_cores / sched_recommended_cores_maintenance).
 * Timestamps are in mach_absolute_time units.
 */
uint64_t    perfcontrol_failsafe_maintenance_runnable_time;
uint64_t    perfcontrol_failsafe_activation_time;
uint64_t    perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN       33 /* (2*MAXCOMLEN)+1 from size of p_name */
char        perfcontrol_failsafe_name[FAILSAFE_NAME_LEN]; /* proc name of the suspect task */
int         perfcontrol_failsafe_pid;                     /* pid of the suspect task */
uint64_t    perfcontrol_failsafe_tid;                     /* thread_id of the suspect thread */
uint64_t    perfcontrol_failsafe_thread_timer_at_start;   /* suspect's on-core time when triggered */
uint64_t    perfcontrol_failsafe_thread_timer_last_seen;  /* suspect's most recently sampled on-core time */
uint64_t    perfcontrol_failsafe_recommended_at_trigger;  /* recommendation mask at trigger time */
8226 
8227 /*
8228  * Perf controller calls here to update the recommended core bitmask.
8229  * If the failsafe is active, we don't immediately apply the new value.
8230  * Instead, we store the new request and use it after the failsafe deactivates.
8231  *
8232  * If the failsafe is not active, immediately apply the update.
8233  *
8234  * No scheduler locks are held, no other locks are held that scheduler might depend on,
8235  * interrupts are enabled
8236  *
8237  * currently prototype is in osfmk/arm/machine_routines.h
8238  */
void
sched_perfcontrol_update_recommended_cores_reason(
	uint64_t                recommended_cores,
	processor_reason_t      reason,
	__unused uint32_t       flags)
{
	assert(preemption_enabled());

	/*
	 * Prepare a pulled-thread queue before taking any locks: threads
	 * pulled off derecommended cores are only woken by the flush,
	 * which must happen after all scheduler locks are dropped.
	 */
	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the request separately per CLPC reason... */
	if (reason == REASON_CLPC_SYSTEM) {
		pcs.pcs_requested_recommended_clpc_system = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		pcs.pcs_requested_recommended_clpc_user = recommended_cores;
	}

	/* ...the combined CLPC request is the intersection of both. */
	pcs.pcs_requested_recommended_clpc = pcs.pcs_requested_recommended_clpc_system &
	    pcs.pcs_requested_recommended_clpc_user;

	/* Expose the combined request for sysctl observability. */
	sysctl_sched_recommended_cores = pcs.pcs_requested_recommended_clpc;

	sched_update_recommended_cores_locked(reason, 0, threadq);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Wake any pulled threads now that it is safe to do so. */
	pulled_thread_queue_flush(threadq);
}
8271 
/*
 * Legacy entry point: treat a plain recommended-cores update as a
 * REASON_CLPC_USER request (the 32-bit mask widens to 64 bits).
 */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
8277 
8278 /*
8279  * Consider whether we need to activate the recommended cores failsafe
8280  *
8281  * Called from quantum timer interrupt context of a realtime thread
8282  * No scheduler locks are held, interrupts are disabled
8283  */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	assert(ml_get_interrupts_enabled() == false);

	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(pcs.pcs_recommended_clpc_failsafe_active)) {
		/* keep track of how long the responsible thread runs */
		uint64_t cur_th_time = recount_current_thread_time_mach();

		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		/* Re-check under the lock; only charge time to the blamed thread. */
		if (pcs.pcs_recommended_clpc_failsafe_active &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
		}

		simple_unlock(&sched_available_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(bit_count(pcs.pcs_requested_recommended_clpc) >= processor_count)) {
		return;
	}

	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation.  We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	/* Prepared before the lock; flushed (threads woken) only after unlock. */
	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Another CPU may have raced us to activate the failsafe. */
	if (pcs.pcs_recommended_clpc_failsafe_active) {
		simple_unlock(&sched_available_cores_lock);
		pulled_thread_queue_flush(threadq);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    pcs.pcs_requested_recommended_clpc, maintenance_runnable_time, 0, 0, 0);

	pcs.pcs_recommended_clpc_failsafe_active = true;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = pcs.pcs_requested_recommended_clpc;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = recount_current_thread_time_mach();

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start  = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Publish the pcs_recommended_clpc_failsafe_active override to the CPUs */
	sched_update_recommended_cores_locked(REASON_SYSTEM, 0, threadq);

	simple_unlock(&sched_available_cores_lock);

	pulled_thread_queue_flush(threadq);
}
8401 
8402 /*
8403  * Now that our bacon has been saved by the failsafe, consider whether to turn it off
8404  *
8405  * Runs in the context of the maintenance thread, no locks held
8406  */
static void
sched_recommended_cores_maintenance(void)
{
	/* Common case - no failsafe, nothing to be done here */
	if (__probable(!pcs.pcs_recommended_clpc_failsafe_active)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (!pcs.pcs_recommended_clpc_failsafe_active) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */
	/*
	 * Note: the gotos above skip these initializations; that is safe because
	 * print_diagnostic stays FALSE on those paths, guarding all uses below.
	 */

	int      pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	uint64_t thread_usage       = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint64_t rec_cores_before   = perfcontrol_failsafe_recommended_at_trigger;
	uint64_t rec_cores_after    = pcs.pcs_requested_recommended_clpc;
	uint64_t failsafe_duration  = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	pcs.pcs_recommended_clpc_failsafe_active = false;

	sched_update_recommended_cores_locked(REASON_SYSTEM, 0, threadq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    pcs.pcs_requested_recommended_clpc, failsafe_duration, 0, 0, 0);

out:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Wakeups must happen outside the lock */
	pulled_thread_queue_flush(threadq);

	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%llx -> 0x%llx\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
8487 
#endif /* __arm__ || __arm64__ */
8489 
8490 /*
8491  * This is true before we have jumped to kernel_bootstrap_thread
8492  * first thread context during boot, or while all processors
8493  * have offlined during system sleep and the scheduler is disabled.
8494  *
8495  * (Note: only ever true on ARM, Intel doesn't actually offline the last CPU)
8496  */
8497 bool
sched_all_cpus_offline(void)8498 sched_all_cpus_offline(void)
8499 {
8500 	return pcs.pcs_effective.pcs_online_cores == 0;
8501 }
8502 
/*
 * Assert that cpu_id is not the only CPU left in the effective online
 * mask — shutting it down would leave nothing for the system to run on.
 */
void
sched_assert_not_last_online_cpu(__assert_only int cpu_id)
{
	assertf(pcs.pcs_effective.pcs_online_cores != BIT(cpu_id),
	    "attempting to shut down the last online CPU!");
}
8509 
8510 /*
8511  * This is the unified single function to change published active core counts based on processor mode.
8512  * Each type of flag affects the other in terms of how the counts change.
8513  *
8514  * Future: Add support for not decrementing counts in 'temporary derecommended online' mode
8515  * Future: Shutdown for system sleep should be 'temporary' according to the user counts
8516  * so that no client sees a transiently low number of CPUs.
8517  */
void
sched_processor_change_mode_locked(processor_t processor, processor_mode_t pcm_mode, bool set)
{
	/* Requires both the available-cores lock and the processor's pset lock. */
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);
	pset_assert_locked(processor->processor_set);

	switch (pcm_mode) {
	case PCM_RECOMMENDED:
		if (set) {
			assert(!processor->is_recommended);
			assert(!bit_test(pcs.pcs_recommended_cores, processor->cpu_id));

			processor->is_recommended = true;
			bit_set(pcs.pcs_recommended_cores, processor->cpu_id);

			/* User-visible counts only change for cores that are online. */
			if (processor->processor_online) {
				os_atomic_inc(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
				if (processor->processor_primary == processor) {
					os_atomic_inc(&primary_processor_avail_count_user, relaxed);
				}
#endif /* CONFIG_SCHED_SMT */
			}
		} else {
			assert(processor->is_recommended);
			assert(bit_test(pcs.pcs_recommended_cores, processor->cpu_id));

			processor->is_recommended = false;
			bit_clear(pcs.pcs_recommended_cores, processor->cpu_id);

			if (processor->processor_online) {
				os_atomic_dec(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
				if (processor->processor_primary == processor) {
					os_atomic_dec(&primary_processor_avail_count_user, relaxed);
				}
#endif /* CONFIG_SCHED_SMT */
			}
		}
		break;
	case PCM_TEMPORARY:
		if (set) {
			assert(!processor->shutdown_temporary);
			assert(!bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id));

			processor->shutdown_temporary = true;
			bit_set(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id);

			/* A temporary shutdown of an offline core counts it back up. */
			if (!processor->processor_online) {
				goto counts_up;
			}
		} else {
			assert(processor->shutdown_temporary);
			assert(bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id));

			processor->shutdown_temporary = false;
			bit_clear(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id);

			if (!processor->processor_online) {
				goto counts_down;
			}
		}
		break;
	case PCM_ONLINE:
		if (set) {
			assert(!processor->processor_online);
			assert(!bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
			processor->processor_online = true;
			bit_set(pcs.pcs_effective.pcs_online_cores, processor->cpu_id);

			/* A temporarily-down core was never removed from the counts. */
			if (!processor->shutdown_temporary) {
				goto counts_up;
			}
		} else {
			assert(processor->processor_online);
			assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
			processor->processor_online = false;
			bit_clear(pcs.pcs_effective.pcs_online_cores, processor->cpu_id);

			if (!processor->shutdown_temporary) {
				goto counts_down;
			}
		}
		break;
	default:
		panic("unknown mode %d", pcm_mode);
	}

	return;

	/* Shared tail: publish this core as newly counted/available. */
counts_up:
	ml_cpu_up_update_counts(processor->cpu_id);

	os_atomic_inc(&processor_avail_count, relaxed);

	if (processor->is_recommended) {
		os_atomic_inc(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
		if (processor->processor_primary == processor) {
			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	}
	commpage_update_active_cpus();

	return;

	/* Shared tail: remove this core from the published counts. */
counts_down:
	ml_cpu_down_update_counts(processor->cpu_id);

	os_atomic_dec(&processor_avail_count, relaxed);

	if (processor->is_recommended) {
		os_atomic_dec(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
		if (processor->processor_primary == processor) {
			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	}
	commpage_update_active_cpus();

	return;
}
8642 
/*
 * Mark the current processor as online and RUNNING in its pset.
 * Returns true if this was the first processor to come online
 * (boot, or wake after all CPUs offlined for system sleep).
 */
bool
sched_mark_processor_online(processor_t processor, __assert_only processor_reason_t reason)
{
	assert(processor == current_processor());

	processor_set_t pset = processor->processor_set;

	/* Prepared before any locks; flushed (wakeups issued) after unlock. */
	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	/* Boot CPU coming online for the first time, either at boot or after sleep */
	bool is_first_online_processor = sched_all_cpus_offline();
	if (is_first_online_processor) {
		assert(processor == master_processor);
	}

	/* Only the boot CPU may come up for REASON_SYSTEM unless bootcpu shutdown is supported. */
	assert((processor != master_processor) || (reason == REASON_SYSTEM) || support_bootcpu_shutdown);

	sched_processor_change_mode_locked(processor, PCM_ONLINE, true);

	assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTING ||
	    processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_RUNNING ||
	    processor->processor_offline_state == PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP);

	processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_STARTED_NOT_WAITED);

	++pset->online_processor_count;
	/* We have to mark the processor as RUNNING and not DISPATCHING because
	 * in the thread_select() path, we assert that IDLE | DISPATCHING implies
	 * running on the idle thread, which is not true at boot.
	 * <rdar://156413254> */
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);

	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(pset);
	}

	SCHED(update_pset_load_average)(pset, 0);
	pset_update_rt_stealable_state(pset);

	pset_unlock(pset);

	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	pulled_thread_queue_flush(threadq);

	return is_first_online_processor;
}
8697 
8698 void
sched_mark_processor_offline(processor_t processor,bool is_final_system_sleep)8699 sched_mark_processor_offline(processor_t processor, bool is_final_system_sleep)
8700 {
8701 	assert(processor == current_processor());
8702 
8703 	struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();
8704 
8705 	processor_set_t pset = processor->processor_set;
8706 
8707 	spl_t s = splsched();
8708 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
8709 
8710 	assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
8711 	assert(processor->processor_offline_state == PROCESSOR_OFFLINE_BEGIN_SHUTDOWN);
8712 
8713 	if (!is_final_system_sleep) {
8714 		/*
8715 		 * We can't shut down the last available core!
8716 		 * Force recommend another CPU if this is the last one.
8717 		 */
8718 
8719 		if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) {
8720 			sched_update_recommended_cores_locked(REASON_SYSTEM, BIT(processor->cpu_id), threadq);
8721 		}
8722 
8723 		/* If we're still the last one, something went wrong. */
8724 		if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) {
8725 			panic("shutting down the last available core! online: 0x%llx rec: 0x%llxx",
8726 			    pcs.pcs_effective.pcs_online_cores,
8727 			    pcs.pcs_recommended_cores);
8728 		}
8729 	}
8730 
8731 	pset_lock(pset);
8732 	assert(processor->state == PROCESSOR_RUNNING);
8733 	assert(processor->processor_inshutdown);
8734 	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
8735 	--pset->online_processor_count;
8736 
8737 	sched_processor_change_mode_locked(processor, PCM_ONLINE, false);
8738 
8739 	if (is_final_system_sleep) {
8740 		assert3u(pcs.pcs_effective.pcs_online_cores, ==, 0);
8741 		assert(processor == master_processor);
8742 		assert(sched_all_cpus_offline());
8743 
8744 		processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP);
8745 	} else {
8746 		processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_PENDING_OFFLINE);
8747 	}
8748 
8749 	simple_unlock(&sched_available_cores_lock);
8750 
8751 	SCHED(processor_queue_shutdown)(processor, threadq);
8752 	/* pset lock dropped */
8753 	SCHED(rt_queue_shutdown)(processor, threadq);
8754 
8755 	splx(s);
8756 
8757 	pulled_thread_queue_flush(threadq);
8758 }
8759 
8760 /*
8761  * Apply a new recommended cores mask to the processors it affects
8762  * Runs after considering failsafes and such
8763  *
8764  * Iterate over processors and update their ->is_recommended field.
8765  * If a processor is running, we let it drain out at its next
8766  * quantum expiration or blocking point. If a processor is idle, there
8767  * may be more work for it to do, so IPI it.
8768  *
8769  * interrupts disabled, sched_available_cores_lock is held
8770  *
8771  * If a core is about to go offline, its bit will be set in core_going_offline,
8772  * so we can make sure not to pick it as the last resort cpu.
8773  */
static void
sched_update_recommended_cores_locked(
	processor_reason_t reason,
	cpumap_t core_going_offline,
	struct pulled_thread_queue *threadq)
{
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);

	/* Start from CLPC's combined (system & user) request... */
	cpumap_t recommended_cores = pcs.pcs_requested_recommended_clpc;

	if (pcs.pcs_init_completed) {
		/* ...restricted by powerdown derecommendations once init is done. */
		recommended_cores &= pcs.pcs_effective.pcs_powerdown_recommended_cores;
	}

	if (pcs.pcs_sleep_override_recommended || pcs.pcs_recommended_clpc_failsafe_active) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);

		/* Failsafe or sleep override: ignore CLPC and use all managed cores. */
		recommended_cores = pcs.pcs_managed_cores;
	}

	if (bit_count(recommended_cores & pcs.pcs_effective.pcs_online_cores & ~core_going_offline) == 0) {
		/*
		 * If there are no online cpus recommended,
		 * then the system will make no forward progress.
		 * Pick a CPU of last resort to avoid hanging.
		 */
		int last_resort;

		if (!support_bootcpu_shutdown) {
			/* We know the master_processor is always available */
			last_resort = master_processor->cpu_id;
		} else {
			/* Pick some still-online processor to be the processor of last resort */
			last_resort = lsb_first(pcs.pcs_effective.pcs_online_cores & ~core_going_offline);

			if (last_resort == -1) {
				panic("%s> no last resort cpu found: 0x%llx 0x%llx",
				    __func__, pcs.pcs_effective.pcs_online_cores, core_going_offline);
			}
		}

		bit_set(recommended_cores, last_resort);
	}

	if (pcs.pcs_recommended_cores == recommended_cores) {
		/* Nothing to do */
		return;
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) |
	    DBG_FUNC_START,
	    recommended_cores,
	    pcs.pcs_recommended_clpc_failsafe_active, pcs.pcs_sleep_override_recommended, 0);

	/* CPUs to signal out of idle once all pset locks are dropped. */
	cpumap_t needs_exit_idle_mask = 0x0;

	/* First set recommended cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			/*
			 * NOTE(review): this pass uses pset_for_id() while the
			 * de-recommend pass below uses pset_array[] directly —
			 * presumably equivalent; confirm.
			 */
			processor_set_t pset = pset_for_id((pset_id_t)pset_id);

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			cpumap_foreach(cpu_id, newly_recommended) {
				processor_t processor = processor_array[cpu_id];

				sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, true);

				processor->last_recommend_reason = reason;

				if (pset->recommended_bitmask == 0) {
					/* Cluster is becoming available for scheduling */
					atomic_bit_set(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed);
				}
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
					/* Set the processor to DISPATCHING so that it exits the idle loop. */
					pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				}

				if (processor->processor_online) {
					SCHED(pset_made_schedulable)(pset);
				}
			}
			SCHED(update_pset_load_average)(pset, 0);
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);

			/* smr_cpu_up is called outside the pset lock. */
			cpumap_foreach(cpu_id, newly_recommended) {
				smr_cpu_up(processor_array[cpu_id],
				    SMR_CPU_REASON_IGNORED);
			}
		}
	}

	/* Now shutdown not recommended cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			cpumap_foreach(cpu_id, newly_unrecommended) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				/* pset lock is re-taken per CPU: queue shutdown drops it. */
				pset_lock(pset);

				sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, false);

				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				pset_update_rt_stealable_state(pset);
				if (pset->recommended_bitmask == 0) {
					/* Cluster is becoming unavailable for scheduling */
					atomic_bit_clear(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed);
				}

				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor, threadq);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor, threadq);

				if (ipi_type == SCHED_IPI_NONE) {
					/*
					 * If the core is idle,
					 * we can directly mark the processor
					 * as "Ignored"
					 *
					 * Otherwise, SMR will detect this
					 * during smr_cpu_leave() when the
					 * processor actually idles.
					 *
					 * Because smr_cpu_down issues thread
					 * wakeups, and we're currently under the
					 * sched_available_cores_lock, we have
					 * to defer it to the flush phase.
					 *
					 * SMR double checks the processor's
					 * is_recommended field under its lock,
					 * so it's safe for this to be called
					 * outside the lock and potentially in
					 * the wrong order vs smr_cpu_up.
					 */
					pulled_thread_queue_needs_smr_cpu_down(threadq, cpu_id);
				} else if (processor == current_processor()) {
					ast_on(AST_PREEMPT);
				} else {
					sched_ipi_perform(processor, ipi_type);
				}
			}
		}
	}

	if (pcs.pcs_init_completed) {
		assert3u(pcs.pcs_recommended_cores, ==, recommended_cores);
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	cpumap_foreach(cpu_id, needs_exit_idle_mask) {
		processor_t processor = processor_array[cpu_id];
		machine_signal_idle(processor);
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
8971 
8972 /*
8973  * Enters with the available cores lock held, returns with it held, but will drop it in the meantime.
8974  * Enters with the cluster_powerdown_lock held, returns with it held, keeps it held.
8975  * Flushes the provided threadq, and returns a different one that needs flushing.
8976  */
/*
 * Apply a new powered/recommended/tempdown core configuration.
 *
 * Entered with cluster_powerdown_lock held, sched_available_cores_lock held,
 * and interrupts disabled; temporarily DROPS the available-cores lock and
 * interrupt disable to start/stop processors, then reacquires both before
 * returning (hence the _drops_lock suffix).
 *
 * requested_reason: why this reconfiguration is happening (published in pcs).
 * caller_s:         the spl state the caller saved when taking the lock;
 *                   must reflect interrupts-enabled (asserted below).
 * threadq:          a prepared pulled-thread queue; it is consumed (flushed)
 *                   mid-function and a fresh one is prepared and returned —
 *                   callers must use the returned queue, not the argument
 *                   (enforced by __result_use_check).
 */
static __result_use_check struct pulled_thread_queue *
sched_update_powered_cores_drops_lock(
	processor_reason_t requested_reason,
	spl_t caller_s,
	struct pulled_thread_queue *threadq)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);
	assert(caller_s == true); /* Caller must have had interrupts enabled when they took the lock */

	/* All transitions should be quiesced before we start changing things */
	assert_no_processors_in_transition_locked();

	pcs.pcs_in_flight_reason = requested_reason;

	struct powered_cores_state requested = sched_compute_requested_powered_cores();
	struct powered_cores_state effective = pcs.pcs_effective;

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested.pcs_online_cores, requested_reason, 0, effective.pcs_online_cores);

	/* The bits that are different and in the new value */
	cpumap_t newly_online_cores = (requested.pcs_online_cores ^
	    effective.pcs_online_cores) & requested.pcs_online_cores;

	/* The bits that are different and are not in the new value */
	cpumap_t newly_offline_cores = (requested.pcs_online_cores ^
	    effective.pcs_online_cores) & ~requested.pcs_online_cores;

	/* Cores becoming recommended for powerdown purposes */
	cpumap_t newly_recommended_cores = (requested.pcs_powerdown_recommended_cores ^
	    effective.pcs_powerdown_recommended_cores) & requested.pcs_powerdown_recommended_cores;

	/* Cores losing their powerdown recommendation */
	cpumap_t newly_derecommended_cores = (requested.pcs_powerdown_recommended_cores ^
	    effective.pcs_powerdown_recommended_cores) & ~requested.pcs_powerdown_recommended_cores;

	/* Cores entering temporary-down status */
	cpumap_t newly_temporary_cores = (requested.pcs_tempdown_cores ^
	    effective.pcs_tempdown_cores) & requested.pcs_tempdown_cores;

	/* Cores leaving temporary-down status */
	cpumap_t newly_nontemporary_cores = (requested.pcs_tempdown_cores ^
	    effective.pcs_tempdown_cores) & ~requested.pcs_tempdown_cores;

	/*
	 * Newly online and derecommended cores should be derecommended
	 * before powering them up, so they never run around doing stuff
	 * before we reach the end of this function.
	 */

	cpumap_t newly_online_and_derecommended = newly_online_cores & newly_derecommended_cores;

	/*
	 * Publish the goal state we're working on achieving.
	 * At the end of this function, pcs_effective will match this.
	 */
	pcs.pcs_requested = requested;

	/*
	 * Apply recommendations first (and strip recommendations from cores
	 * that are about to power up derecommended); full derecommendation
	 * of already-running cores happens at the end, after shutdowns.
	 */
	pcs.pcs_effective.pcs_powerdown_recommended_cores |= newly_recommended_cores;
	pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_online_and_derecommended;

	sched_update_recommended_cores_locked(requested_reason, 0, threadq);

	/* Drop to an enabled/preemptible context for the slow start/stop work. */
	simple_unlock(&sched_available_cores_lock);
	splx(caller_s);

	pulled_thread_queue_flush(threadq);
	/* a new threadq must be prepared again before use */

	assert(ml_get_interrupts_enabled() == true);
	assert(preemption_enabled());

	/* First set powered cores */
	cpumap_t started_cores = 0ull;
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_newly_online = newly_online_cores & pset->cpu_bitmask;

			/* Sanity: none of the cores we're about to start is already up. */
			__assert_only cpumap_t pset_online_cores =
			    pset->cpu_state_map[PROCESSOR_START] |
			    pset->cpu_state_map[PROCESSOR_IDLE] |
			    pset->cpu_state_map[PROCESSOR_DISPATCHING] |
			    pset->cpu_state_map[PROCESSOR_RUNNING];
			assert((pset_online_cores & pset_newly_online) == 0);

			pset_unlock(pset);
			splx(s);

			if (pset_newly_online == 0) {
				/* Nothing to do */
				continue;
			}
			cpumap_foreach(cpu_id, pset_newly_online) {
				processor_start_reason(processor_array[cpu_id], requested_reason);
				bit_set(started_cores, cpu_id);
			}
		}
	}

	/*
	 * Wait for processors to finish starting in parallel.
	 * We never proceed until all newly started processors have finished.
	 *
	 * This has the side effect of closing the ml_cpu_up_processors race,
	 * as all started CPUs must have SIGPdisabled cleared by the time this
	 * is satisfied. (rdar://124631843)
	 */
	cpumap_foreach(cpu_id, started_cores) {
		processor_wait_for_start(processor_array[cpu_id], PROCESSOR_POWERED_CORES_CHANGE);
	}

	/*
	 * Update published counts of processors to match new temporary status
	 * Publish all temporary before nontemporary, so that any readers that
	 * see a middle state will see a slightly too high count instead of
	 * ending up seeing a 0 (because that crashes dispatch_apply, ask
	 * me how I know)
	 */

	spl_t s;
	s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Pass 1: mark cores that are newly temporary. */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			pset_lock(pset);

			cpumap_t pset_newly_temporary = newly_temporary_cores & pset->cpu_bitmask;

			cpumap_foreach(cpu_id, pset_newly_temporary) {
				sched_processor_change_mode_locked(processor_array[cpu_id],
				    PCM_TEMPORARY, true);
			}

			pset_unlock(pset);
		}
	}

	/* Pass 2: clear cores that are no longer temporary. */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			pset_lock(pset);

			cpumap_t pset_newly_nontemporary = newly_nontemporary_cores & pset->cpu_bitmask;

			cpumap_foreach(cpu_id, pset_newly_nontemporary) {
				sched_processor_change_mode_locked(processor_array[cpu_id],
				    PCM_TEMPORARY, false);
			}

			pset_unlock(pset);
		}
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Now shutdown not powered cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			s = splsched();
			pset_lock(pset);

			cpumap_t pset_newly_offline = newly_offline_cores & pset->cpu_bitmask;
			/* Sanity: every core we're about to stop is currently powered. */
			__assert_only cpumap_t pset_powered_cores =
			    pset->cpu_state_map[PROCESSOR_START] |
			    pset->cpu_state_map[PROCESSOR_IDLE] |
			    pset->cpu_state_map[PROCESSOR_DISPATCHING] |
			    pset->cpu_state_map[PROCESSOR_RUNNING];
			assert((pset_powered_cores & pset_newly_offline) == pset_newly_offline);

			pset_unlock(pset);
			splx(s);

			if (pset_newly_offline == 0) {
				/* Nothing to do */
				continue;
			}

			cpumap_foreach(cpu_id, pset_newly_offline) {
				processor_exit_reason(processor_array[cpu_id], requested_reason, false);
			}
		}
	}

	assert(ml_get_interrupts_enabled() == true);
	assert(preemption_enabled());

	/* The old queue was flushed above; hand the caller a fresh one. */
	threadq = pulled_thread_queue_prepare();

	/* Reacquire the locks in the state the caller expects on return. */
	s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(s == caller_s);

	/* Now apply the remaining derecommendations (cores already running). */
	pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_derecommended_cores;

	sched_update_recommended_cores_locked(requested_reason, 0, threadq);

	pcs.pcs_previous_reason = requested_reason;

	/* All transitions should be quiesced now that we are done changing things */
	assert_no_processors_in_transition_locked();

	/* We promised pcs_effective would converge to pcs_requested; verify. */
	assert3u(pcs.pcs_requested.pcs_online_cores, ==, pcs.pcs_effective.pcs_online_cores);
	assert3u(pcs.pcs_requested.pcs_tempdown_cores, ==, pcs.pcs_effective.pcs_tempdown_cores);
	assert3u(pcs.pcs_requested.pcs_powerdown_recommended_cores, ==, pcs.pcs_effective.pcs_powerdown_recommended_cores);

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
	return threadq;
}
9196 
9197 void
thread_set_options(uint32_t thopt)9198 thread_set_options(uint32_t thopt)
9199 {
9200 	spl_t x;
9201 	thread_t t = current_thread();
9202 
9203 	x = splsched();
9204 	thread_lock(t);
9205 
9206 	t->options |= thopt;
9207 
9208 	thread_unlock(t);
9209 	splx(x);
9210 }
9211 
/*
 * Stash a hint describing the primitive `thread` is about to block on,
 * presumably consumed by the subsequent block path for tracing/introspection
 * — NOTE(review): no lock is taken here; caller must own the thread context.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
9217 
/*
 * Return the recommended parallelism width for the given QoS class,
 * delegating to the active scheduler policy's implementation.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
9223 
9224 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)9225 sched_qos_max_parallelism(__unused int qos, uint64_t options)
9226 {
9227 	host_basic_info_data_t hinfo;
9228 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
9229 
9230 
9231 	/*
9232 	 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
9233 	 * implement their own qos_max_parallelism() interfaces.
9234 	 */
9235 	assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
9236 
9237 	/* Query the machine layer for core information */
9238 	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
9239 	    (host_info_t)&hinfo, &count);
9240 	assert(kret == KERN_SUCCESS);
9241 
9242 	if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
9243 		return hinfo.logical_cpu;
9244 	} else {
9245 		return hinfo.physical_cpu;
9246 	}
9247 }
9248 
9249 int sched_allow_NO_SMT_threads = 1;
9250 #if CONFIG_SCHED_SMT
9251 bool
thread_no_smt(thread_t thread)9252 thread_no_smt(thread_t thread)
9253 {
9254 	return sched_allow_NO_SMT_threads &&
9255 	       (thread->bound_processor == PROCESSOR_NULL) &&
9256 	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
9257 }
9258 
9259 bool
processor_active_thread_no_smt(processor_t processor)9260 processor_active_thread_no_smt(processor_t processor)
9261 {
9262 	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
9263 }
9264 #endif /* CONFIG_SCHED_SMT */
9265 
9266 #if __arm64__
9267 
9268 /*
9269  * Set up or replace old timer with new timer
9270  *
9271  * Returns true if canceled old timer, false if it did not
9272  */
9273 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)9274 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
9275 {
9276 	/*
9277 	 * Exchange deadline for new deadline, if old deadline was nonzero,
9278 	 * then I cancelled the callback, otherwise I didn't
9279 	 */
9280 
9281 	return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
9282 	           relaxed) != 0;
9283 }
9284 
9285 /*
9286  * Set global SFI window (in usec)
9287  */
9288 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)9289 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
9290 {
9291 	kern_return_t ret = KERN_NOT_SUPPORTED;
9292 #if CONFIG_THREAD_GROUPS
9293 	if (window_usecs == 0ULL) {
9294 		ret = sfi_window_cancel();
9295 	} else {
9296 		ret = sfi_set_window(window_usecs);
9297 	}
9298 #endif // CONFIG_THREAD_GROUPS
9299 	return ret;
9300 }
9301 
9302 /*
9303  * Set background / maintenance / mitigation SFI class offtimes
9304  */
9305 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)9306 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
9307 {
9308 	kern_return_t ret = KERN_NOT_SUPPORTED;
9309 #if CONFIG_THREAD_GROUPS
9310 	if (offtime_usecs == 0ULL) {
9311 		ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
9312 		ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
9313 		ret |= sfi_class_offtime_cancel(SFI_CLASS_RUNAWAY_MITIGATION);
9314 	} else {
9315 		ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
9316 		ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
9317 		ret |= sfi_set_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, offtime_usecs);
9318 	}
9319 #endif // CONFIG_THREAD_GROUPS
9320 	return ret;
9321 }
9322 
9323 /*
9324  * Set utility SFI class offtime
9325  */
9326 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)9327 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
9328 {
9329 	kern_return_t ret = KERN_NOT_SUPPORTED;
9330 #if CONFIG_THREAD_GROUPS
9331 	if (offtime_usecs == 0ULL) {
9332 		ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
9333 	} else {
9334 		ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
9335 	}
9336 #endif // CONFIG_THREAD_GROUPS
9337 	return ret;
9338 }
9339 
9340 #endif /* __arm64__ */
9341 
/*
 * Intentionally a no-op in this scheduler configuration; other scheduler
 * variants override this to maintain per-pset execution-time averages.
 */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
9346 
/*
 * Intentionally a no-op in this scheduler configuration; other scheduler
 * variants override this to maintain a per-pset load average.
 */
void
sched_update_pset_load_average(__unused processor_set_t pset, __unused uint64_t curtime)
{
}
9351 
9352 /* pset is locked */
9353 bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)9354 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
9355 {
9356 	int cpuid = processor->cpu_id;
9357 #if defined(__x86_64__)
9358 	if (sched_avoid_cpu0 && (cpuid == 0)) {
9359 		return false;
9360 	}
9361 #endif
9362 
9363 	cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
9364 
9365 	return bit_test(fasttrack_map, cpuid);
9366 }
9367 
9368 #if CONFIG_SCHED_SMT
9369 /* pset is locked */
9370 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)9371 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
9372 {
9373 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
9374 	int nbackup_cpus = 0;
9375 
9376 	if (include_backups && rt_runq_is_low_latency(pset)) {
9377 		nbackup_cpus = sched_rt_n_backup_processors;
9378 	}
9379 
9380 	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
9381 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
9382 		bit_clear(cpu_map, 0);
9383 	}
9384 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
9385 }
9386 
9387 /* pset is locked */
9388 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)9389 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
9390 {
9391 	int nbackup_cpus = 0;
9392 
9393 	if (include_backups && rt_runq_is_low_latency(pset)) {
9394 		nbackup_cpus = sched_rt_n_backup_processors;
9395 	}
9396 
9397 	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
9398 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
9399 }
9400 #endif /* CONFIG_SCHED_SMT */
9401 
/*
 * Decide whether `processor` may run a realtime thread right now,
 * honoring recommendation state and (on SMT systems) cpu0-avoidance
 * and SMT-sharing policies. With as_backup, low-latency backup
 * processors are counted toward realtime demand.
 */
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if CONFIG_SCHED_SMT
	/* A spill already targeted at this CPU always permits realtime. */
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* Soft avoidance: use cpu0 only when other primaries are saturated. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* Strict avoidance: cpu0 only when CPUs other than 0/1 are saturated. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/* cpu1 is cpu0's SMT sibling; gate it on the RT-on-SMT policy too. */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* Secondary (hyperthread): only when allowed and primaries are saturated. */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else /* CONFIG_SCHED_SMT */
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif /* CONFIG_SCHED_SMT */
	return ok_to_run_realtime_thread;
}
9432 
/*
 * Hook invoked when a pset becomes schedulable; intentionally a no-op in
 * this scheduler configuration.
 */
void
sched_pset_made_schedulable(__unused processor_set_t pset)
{
}
9437 
9438 #if defined(__x86_64__)
9439 void
thread_set_no_smt(bool set)9440 thread_set_no_smt(bool set)
9441 {
9442 	(void) set;
9443 #if CONFIG_SCHED_SMT
9444 	if (!system_is_SMT) {
9445 		/* Not a machine that supports SMT */
9446 		return;
9447 	}
9448 
9449 	thread_t thread = current_thread();
9450 
9451 	spl_t s = splsched();
9452 	thread_lock(thread);
9453 	if (set) {
9454 		thread->sched_flags |= TH_SFLAG_NO_SMT;
9455 	}
9456 	thread_unlock(thread);
9457 	splx(s);
9458 #endif /* CONFIG_SCHED_SMT */
9459 }
9460 #endif /* __x86_64__ */
9461 
9462 
9463 #if CONFIG_SCHED_SMT
9464 bool
thread_get_no_smt(void)9465 thread_get_no_smt(void)
9466 {
9467 	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
9468 }
9469 
9470 extern void task_set_no_smt(task_t);
9471 void
task_set_no_smt(task_t task)9472 task_set_no_smt(task_t task)
9473 {
9474 	if (!system_is_SMT) {
9475 		/* Not a machine that supports SMT */
9476 		return;
9477 	}
9478 
9479 	if (task == TASK_NULL) {
9480 		task = current_task();
9481 	}
9482 
9483 	task_lock(task);
9484 	task->t_flags |= TF_NO_SMT;
9485 	task_unlock(task);
9486 }
9487 
9488 #if DEBUG || DEVELOPMENT
9489 extern void sysctl_task_set_no_smt(char no_smt);
9490 void
sysctl_task_set_no_smt(char no_smt)9491 sysctl_task_set_no_smt(char no_smt)
9492 {
9493 	if (!system_is_SMT) {
9494 		/* Not a machine that supports SMT */
9495 		return;
9496 	}
9497 
9498 	task_t task = current_task();
9499 
9500 	task_lock(task);
9501 	if (no_smt == '1') {
9502 		task->t_flags |= TF_NO_SMT;
9503 	}
9504 	task_unlock(task);
9505 }
9506 
9507 extern char sysctl_task_get_no_smt(void);
9508 char
sysctl_task_get_no_smt(void)9509 sysctl_task_get_no_smt(void)
9510 {
9511 	task_t task = current_task();
9512 
9513 	if (task->t_flags & TF_NO_SMT) {
9514 		return '1';
9515 	}
9516 	return '0';
9517 }
9518 #endif /* DEVELOPMENT || DEBUG */
9519 #else /* CONFIG_SCHED_SMT */
9520 
extern void task_set_no_smt(task_t);
/* Stub for builds without CONFIG_SCHED_SMT: NO_SMT has no meaning here. */
void
task_set_no_smt(__unused task_t task)
{
	return;
}
9527 
9528 #if DEBUG || DEVELOPMENT
extern void sysctl_task_set_no_smt(char no_smt);
/* Stub for builds without CONFIG_SCHED_SMT: NO_SMT has no meaning here. */
void
sysctl_task_set_no_smt(__unused char no_smt)
{
	return;
}
9535 
extern char sysctl_task_get_no_smt(void);
/*
 * Stub for builds without CONFIG_SCHED_SMT: reports '1' (effectively
 * NO_SMT, since SMT sharing cannot occur at all on these builds).
 */
char
sysctl_task_get_no_smt(void)
{
	return '1';
}
9542 #endif /* DEBUG || DEVELOPMENT */
9543 #endif /* CONFIG_SCHED_SMT */
9544 
9545 #if __AMP__
9546 static kern_return_t
pset_cluster_type_from_name_char(char cluster_type_name,pset_cluster_type_t * pset_cluster_type)9547 pset_cluster_type_from_name_char(char cluster_type_name, pset_cluster_type_t *pset_cluster_type)
9548 {
9549 	switch (cluster_type_name) {
9550 	case 'E':
9551 	case 'e':
9552 		*pset_cluster_type = PSET_AMP_E;
9553 		return KERN_SUCCESS;
9554 	case 'P':
9555 	case 'p':
9556 		*pset_cluster_type = PSET_AMP_P;
9557 		return KERN_SUCCESS;
9558 	default:
9559 		return KERN_INVALID_ARGUMENT;
9560 	}
9561 }
9562 #endif /* __AMP__ */
9563 
/*
 * Soft-bind `thread` to the first pset of the cluster type named by
 * `cluster_type` ('E'/'e' or 'P'/'p'). Any existing binding is cleared
 * first, even if the new request fails. If the caller binds itself, a
 * context switch is forced so the thread migrates immediately.
 * Returns KERN_INVALID_ARGUMENT for an unknown type character or a type
 * absent from this system; no-op success on non-AMP builds.
 */
__private_extern__ kern_return_t
thread_soft_bind_cluster_type(thread_t thread, char cluster_type)
{
#if __AMP__
	kern_return_t kr;
	spl_t s = splsched();
	thread_lock(thread);
	/* Unconditionally drop any prior binding before evaluating the request. */
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	pset_cluster_type_t pset_cluster_type;
	kr = pset_cluster_type_from_name_char(cluster_type, &pset_cluster_type);
	if (kr == KERN_SUCCESS) {
		pset_node_t bind_node = pset_node_for_pset_cluster_type(pset_cluster_type);
		if (bind_node != PSET_NODE_NULL) {
			/* Bind to the node's first pset. */
			thread->th_bound_cluster_id = bind_node->psets->pset_id;
		} else {
			/*
			 * The specified cluster type isn't present on the system,
			 * either because we're too early in boot or because the
			 * underlying platform lacks that cluster type. This error
			 * code assumes the latter.
			 */
			kr = KERN_INVALID_ARGUMENT;
		}
	}
	thread_unlock(thread);
	splx(s);

	if ((kr == KERN_SUCCESS) && (thread == current_thread())) {
		/* Trigger a context-switch to get on the newly bound cluster */
		thread_block(THREAD_CONTINUE_NULL);
	}
	return kr;
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
	return KERN_SUCCESS;
#endif /* __AMP__ */
}
9602 
extern uint32_t thread_bound_cluster_id(thread_t thread);
/*
 * Return the cluster id the thread is soft-bound to, or
 * THREAD_BOUND_CLUSTER_NONE when unbound.
 */
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}
9609 
9610 __private_extern__ kern_return_t
thread_soft_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)9611 thread_soft_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
9612 {
9613 #if __AMP__
9614 	if (cluster_id == THREAD_BOUND_CLUSTER_NONE) {
9615 		/* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
9616 		options |= THREAD_UNBIND;
9617 	}
9618 
9619 	if (options & THREAD_UNBIND) {
9620 		cluster_id = THREAD_BOUND_CLUSTER_NONE;
9621 	} else {
9622 		/* Validate the specified cluster id */
9623 		int max_clusters = ml_get_cluster_count();
9624 		if (cluster_id >= max_clusters) {
9625 			/* Invalid cluster id */
9626 			return KERN_INVALID_VALUE;
9627 		}
9628 		processor_set_t pset = pset_array[cluster_id];
9629 		if (pset == NULL) {
9630 			/* Cluster has not finished initializing at boot */
9631 			return KERN_FAILURE;
9632 		}
9633 		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
9634 			if (SCHED(thread_eligible_for_pset)(thread, pset) == false) {
9635 				/* Thread is not recommended for the cluster type */
9636 				return KERN_INVALID_POLICY;
9637 			}
9638 		}
9639 	}
9640 
9641 	spl_t s = splsched();
9642 	thread_lock(thread);
9643 
9644 	thread->th_bound_cluster_id = cluster_id;
9645 
9646 	thread_unlock(thread);
9647 	splx(s);
9648 
9649 	if (thread == current_thread()) {
9650 		/* Trigger a context-switch to get on the newly bound cluster */
9651 		thread_block(THREAD_CONTINUE_NULL);
9652 	}
9653 #else /* __AMP__ */
9654 	(void)thread;
9655 	(void)cluster_id;
9656 	(void)options;
9657 #endif /* __AMP__ */
9658 	return KERN_SUCCESS;
9659 }
9660 
9661 #if DEVELOPMENT || DEBUG
9662 extern int32_t sysctl_get_bound_cpuid(void);
9663 int32_t
sysctl_get_bound_cpuid(void)9664 sysctl_get_bound_cpuid(void)
9665 {
9666 	int32_t cpuid = -1;
9667 	thread_t self = current_thread();
9668 
9669 	processor_t processor = self->bound_processor;
9670 	if (processor == NULL) {
9671 		cpuid = -1;
9672 	} else {
9673 		cpuid = processor->cpu_id;
9674 	}
9675 
9676 	return cpuid;
9677 }
9678 
9679 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
9680 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)9681 sysctl_thread_bind_cpuid(int32_t cpuid)
9682 {
9683 	processor_t processor = PROCESSOR_NULL;
9684 
9685 	if (cpuid == -1) {
9686 		goto unbind;
9687 	}
9688 
9689 	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
9690 		return KERN_INVALID_VALUE;
9691 	}
9692 
9693 	processor = processor_array[cpuid];
9694 	if (processor == PROCESSOR_NULL) {
9695 		return KERN_INVALID_VALUE;
9696 	}
9697 
9698 unbind:
9699 	thread_bind(processor);
9700 
9701 	thread_block(THREAD_CONTINUE_NULL);
9702 	return KERN_SUCCESS;
9703 }
9704 
9705 #if __AMP__
9706 
9707 static char
pset_cluster_type_to_name_char(pset_cluster_type_t pset_type)9708 pset_cluster_type_to_name_char(pset_cluster_type_t pset_type)
9709 {
9710 	switch (pset_type) {
9711 	case PSET_AMP_E:
9712 		return 'E';
9713 	case PSET_AMP_P:
9714 		return 'P';
9715 	default:
9716 		panic("Unexpected AMP pset cluster type %d", pset_type);
9717 	}
9718 }
9719 
9720 #endif /* __AMP__ */
9721 
extern char sysctl_get_task_cluster_type(void);
/*
 * Sysctl backend: report the current task's preferred cluster type as a
 * name character ('E'/'P'), or '0' when no pset hint is recorded or the
 * build is not AMP.
 */
char
sysctl_get_task_cluster_type(void)
{
#if __AMP__
	processor_set_t pset_hint = current_task()->pset_hint;

	return pset_hint ?
	       pset_cluster_type_to_name_char(pset_hint->pset_cluster_type) : '0';
#else /* !__AMP__ */
	return '0';
#endif /* __AMP__ */
}
9738 
9739 #if __AMP__
9740 extern char sysctl_get_bound_cluster_type(void);
9741 char
sysctl_get_bound_cluster_type(void)9742 sysctl_get_bound_cluster_type(void)
9743 {
9744 	thread_t self = current_thread();
9745 
9746 	if (self->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) {
9747 		return '0';
9748 	}
9749 	pset_cluster_type_t pset_type = pset_array[self->th_bound_cluster_id]->pset_cluster_type;
9750 	return pset_cluster_type_to_name_char(pset_type);
9751 }
9752 
/*
 * Find a pset of the given cluster type. Within the first matching node,
 * prefer a pset with recommended processors; otherwise fall back to the
 * last pset examined in that node (which may have none recommended).
 * Returns PROCESSOR_SET_NULL if no node of that type exists.
 */
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer one with recommended processors */
			if (pset_is_recommended(pset)) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
9776 #endif /* __AMP__ */
9777 
9778 extern kern_return_t sysctl_task_set_cluster_type(char cluster_type);
9779 kern_return_t
sysctl_task_set_cluster_type(char cluster_type)9780 sysctl_task_set_cluster_type(char cluster_type)
9781 {
9782 #if __AMP__
9783 	kern_return_t kr;
9784 	task_t task = current_task();
9785 	pset_cluster_type_t pset_cluster_type;
9786 	kr = pset_cluster_type_from_name_char(cluster_type, &pset_cluster_type);
9787 	if (kr == KERN_SUCCESS) {
9788 		processor_set_t pset_hint = find_pset_of_type(pset_cluster_type);
9789 		if (pset_hint) {
9790 			task_lock(task);
9791 			task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
9792 			task->pset_hint = pset_hint;
9793 			task_unlock(task);
9794 
9795 			thread_block(THREAD_CONTINUE_NULL);
9796 			return KERN_SUCCESS;
9797 		}
9798 	}
9799 	return KERN_INVALID_ARGUMENT;
9800 #else
9801 	(void)cluster_type;
9802 	return KERN_SUCCESS;
9803 #endif
9804 }
9805 
9806 extern kern_return_t sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread,
9807     int sched_bucket, uint64_t *cpu_stats);
9808 
9809 #if CONFIG_SCHED_CLUTCH
9810 
/*
 * Sysctl backend: fetch the thread-group CPU time stats for the given
 * thread and sched bucket, delegating to the clutch scheduler.
 */
kern_return_t
sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread,
    int sched_bucket, uint64_t *cpu_stats)
{
	return sched_clutch_thread_group_cpu_time_for_thread(thread, sched_bucket, cpu_stats);
}
9817 
9818 #else /* !CONFIG_SCHED_CLUTCH */
9819 
/*
 * Stub for builds without the clutch scheduler: thread-group CPU time
 * stats are unavailable.
 */
kern_return_t
sysctl_clutch_thread_group_cpu_time_for_thread(__unused thread_t thread,
    __unused int sched_bucket, __unused uint64_t *cpu_stats)
{
	return KERN_NOT_SUPPORTED;
}
9826 
9827 #endif /* !CONFIG_SCHED_CLUTCH */
9828 
9829 #endif /* DEVELOPMENT || DEBUG */
9830