1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #include <kern/monotonic.h>
94 #include <kern/processor.h>
95 #include <kern/queue.h>
96 #include <kern/recount.h>
97 #include <kern/restartable.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109
110 #include <vm/pmap.h>
111 #include <vm/vm_kern.h>
112 #include <vm/vm_map.h>
113 #include <vm/vm_pageout_xnu.h>
114
115 #include <mach/sdt.h>
116 #include <mach/mach_host.h>
117 #include <mach/host_info.h>
118
119 #include <sys/kdebug.h>
120 #include <kperf/kperf.h>
121 #include <kern/kpc.h>
122 #include <san/kasan.h>
123 #include <kern/pms.h>
124 #include <kern/host.h>
125 #include <stdatomic.h>
126 #include <os/atomic_private.h>
127 #include <os/log.h>
128
/*
 * KTRC: kdebug tracepoint macro used in this file.  Prefer the
 * macOS-release variant when the kernel defines it; otherwise fall
 * back to the generic release tracepoint macro.
 */
#ifdef KDBG_MACOS_RELEASE
#define KTRC KDBG_MACOS_RELEASE
#else
#define KTRC KDBG_RELEASE
#endif


/* Per-CPU scheduler statistics (presumably gated by sched_stats_active — confirm at use sites). */
struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;
138
/*
 * deadline_add:
 *
 * Sum two absolute-time values, saturating at UINT64_MAX
 * instead of wrapping on unsigned overflow.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	if (e > UINT64_MAX - d) {
		return UINT64_MAX;
	}
	return d + e;
}
145
146 int
rt_runq_count(processor_set_t pset)147 rt_runq_count(processor_set_t pset)
148 {
149 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
150 }
151
152 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)153 rt_runq_earliest_deadline(processor_set_t pset)
154 {
155 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
156 }
157
158 static int
rt_runq_priority(processor_set_t pset)159 rt_runq_priority(processor_set_t pset)
160 {
161 pset_assert_locked(pset);
162 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
163
164 bitmap_t *map = rt_run_queue->bitmap;
165 int i = bitmap_first(map, NRTQS);
166 assert(i < NRTQS);
167
168 if (i >= 0) {
169 return i + BASEPRI_RTQUEUES;
170 }
171
172 return i;
173 }
174
static thread_t rt_runq_first(rt_queue_t rt_runq);

#if DEBUG
/*
 * check_rt_runq_consistency:
 *
 * DEBUG-only invariant checker for a realtime run queue.  Walks every
 * per-priority queue and asserts that:
 *  - each queued thread has the matching sched_pri and in-range
 *    deadline/constraint values;
 *  - each per-priority queue is sorted by ascending deadline and its
 *    cached pri_earliest_deadline/pri_constraint match its head;
 *  - the occupancy bitmap agrees with the per-priority counts;
 *  - the queue-wide cached aggregates (earliest_deadline, count,
 *    constraint, ed_index) match a fresh recomputation.
 * If 'thread' is non-NULL, also asserts that it is present somewhere
 * on the queue.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	/* Recomputed aggregates, to compare against the cached ones. */
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0; /* for the sorted-order assert */
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* Queue must be sorted by ascending deadline. */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* Cached per-priority head values must match the actual head. */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* Empty queue: bitmap bit clear, sentinels in the caches. */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* Queue-wide cached aggregates must match the recomputation. */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
#endif
240
241 uint32_t rt_constraint_threshold;
242
243 static bool
rt_runq_is_low_latency(processor_set_t pset)244 rt_runq_is_low_latency(processor_set_t pset)
245 {
246 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
247 }
248
TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);

/* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
static uint64_t nonurgent_preemption_timer_abs = 0;

#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

/*
 * Quantum budgets for "unsafe" RT / fixed-priority execution;
 * consumed by sched_set_max_unsafe_rt_quanta() and
 * sched_set_max_unsafe_fixed_quanta() below.
 */
#if XNU_TARGET_OS_XR
#define MAX_UNSAFE_RT_QUANTA 1
#define SAFE_RT_MULTIPLIER 5
#else
#define MAX_UNSAFE_RT_QUANTA 100
#define SAFE_RT_MULTIPLIER 2
#endif /* XNU_TARGET_OS_XR */

#define MAX_UNSAFE_FIXED_QUANTA 100
#define SAFE_FIXED_MULTIPLIER SAFE_RT_MULTIPLIER

TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);

TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_FIXED_MULTIPLIER);

#define MAX_POLL_QUANTA 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

uint64_t max_poll_computation;

/* Derived (abstime) limits; computed from the quanta tunables at timebase init. */
uint64_t max_unsafe_rt_computation;
uint64_t max_unsafe_fixed_computation;
uint64_t sched_safe_rt_duration;
uint64_t sched_safe_fixed_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Timeshare quanta in abstime units, derived in sched_timeshare_timebase_init(). */
uint32_t std_quantum;
uint32_t min_std_quantum;
uint32_t bg_quantum;

/* Timeshare quanta in microseconds, derived in sched_timeshare_init(). */
uint32_t std_quantum_us;
uint32_t bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t thread_depress_time;
uint32_t default_timeshare_computation;
uint32_t default_timeshare_constraint;

/* Realtime computation bounds (abstime), set in sched_realtime_timebase_init(). */
uint32_t max_rt_quantum;
uint32_t min_rt_quantum;
308
/* Epsilon (abstime) used when comparing RT deadlines; derived from rt_deadline_epsilon_us in sched_set_rt_deadline_epsilon(). */
uint32_t rt_deadline_epsilon;

/*
 * NOTE: rt_constraint_threshold is defined once, earlier in this file
 * (next to rt_runq_is_low_latency()).  A second, redundant tentative
 * definition that used to sit here has been removed — C merges same-TU
 * tentative definitions, but duplicating them is misleading.
 */
312
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Scheduler tick counter and its period (abstime). */
unsigned sched_tick;
uint32_t sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t sched_load_compute_interval_us = 15000;
uint64_t sched_load_compute_interval_abs;
static _Atomic uint64_t sched_load_compute_deadline;

uint32_t sched_pri_shifts[TH_BUCKET_MAX];
uint32_t sched_fixed_shift;

uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling (values in nanoseconds) */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
                                                       * 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
                                                       * <= 5ms */

uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

/* Mutex serializing cluster powerdown state changes. */
LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);

/* interrupts disabled lock to guard core online, recommendation, pcs state */
decl_simple_lock_data(, sched_available_cores_lock);
350
/*
 * Global powered-cores state ("PCS"): the single bookkeeping record for
 * which CPUs are managed, online, recommended, and powered.
 *
 * Locked by sched_available_cores_lock.
 * cluster_powerdown_lock is held while making changes to CPU offline state.
 */
static struct global_powered_cores_state {
	/*
	 * Set when PCS has seen all cores boot up and is ready to manage online
	 * state. CPU recommendation works before this point.
	 */
	bool pcs_init_completed;

	cpumap_t pcs_managed_cores; /* all cores managed by the PCS */

	/*
	 * Inputs for CPU offline state provided by clients
	 */
	cpumap_t pcs_requested_online_user; /* updated by processor_start/exit from userspace */
	cpumap_t pcs_requested_online_clpc_user;
	cpumap_t pcs_requested_online_clpc_system;
	cpumap_t pcs_required_online_pmgr; /* e.g. ANE needs these powered for their rail to be happy */
	cpumap_t pcs_required_online_system; /* e.g. smt1 for interrupts, boot processor unless boot arg is set, makes them disable instead of sleep */

	/*
	 * When a suspend count is held, all CPUs must be powered up.
	 */
	int32_t pcs_powerdown_suspend_count;

	/*
	 * Disable automatic cluster powerdown in favor of explicit user core online control
	 */
	bool pcs_user_online_core_control;
	bool pcs_wants_kernel_sleep;
	bool pcs_in_kernel_sleep;

	struct powered_cores_state {
		/*
		 * The input into the recommendation computation from update powered cores.
		 */
		cpumap_t pcs_powerdown_recommended_cores;

		/*
		 * These cores are online and are not powered down.
		 *
		 * Processors with processor->processor_online bit set.
		 */
		cpumap_t pcs_online_cores;

		/*
		 * These cores are disabled or powered down
		 * due to temporary reasons and will come back under presented load
		 * so the user should still see them as active in the cpu count.
		 *
		 * Processors with processor->shutdown_temporary bit set.
		 */
		cpumap_t pcs_tempdown_cores;
	} pcs_effective;

	/* The 'goal state' PCS has computed and is attempting to apply */
	struct powered_cores_state pcs_requested;

	/*
	 * Inputs into CPU recommended cores provided by clients.
	 * Note that these may be changed under the available cores lock and
	 * become effective while sched_update_powered_cores_drops_lock is in
	 * the middle of making changes to CPU online state.
	 */

	cpumap_t pcs_requested_recommended_clpc;
	cpumap_t pcs_requested_recommended_clpc_system;
	cpumap_t pcs_requested_recommended_clpc_user;
	bool pcs_recommended_clpc_failsafe_active;
	bool pcs_sleep_override_recommended;

	/*
	 * These cores are recommended and can be used for execution
	 * of non-bound threads.
	 *
	 * Processors with processor->is_recommended bit set.
	 */
	cpumap_t pcs_recommended_cores;

	/*
	 * These are for the debugger.
	 * Use volatile to stop the compiler from optimizing out the stores
	 */
	volatile processor_reason_t pcs_in_flight_reason;
	volatile processor_reason_t pcs_previous_reason;
} pcs = {
	/*
	 * Powerdown is suspended during boot until after all CPUs finish booting,
	 * released by sched_cpu_init_completed.
	 */
	.pcs_powerdown_suspend_count = 1,
	.pcs_requested_online_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_system = ALL_CORES_POWERED,
	.pcs_in_flight_reason = REASON_NONE,
	.pcs_previous_reason = REASON_NONE,
	.pcs_requested.pcs_powerdown_recommended_cores = ALL_CORES_POWERED,
	.pcs_requested_recommended_clpc = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_system = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_user = ALL_CORES_RECOMMENDED,
};
454
uint64_t sysctl_sched_recommended_cores = ALL_CORES_RECOMMENDED;

static int sched_last_resort_cpu(void);

static void sched_update_recommended_cores_locked(processor_reason_t reason, cpumap_t core_going_offline);
static void sched_update_powered_cores_drops_lock(processor_reason_t requested_reason, spl_t s);

#if __arm64__
static void sched_recommended_cores_maintenance(void);
uint64_t perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm64__ */

/* One second expressed in abstime units; set in sched_timebase_init(). */
uint64_t sched_one_second_interval;
boolean_t allow_direct_handoff = TRUE;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t processor_idle(
	thread_t thread,
	processor_t processor);

static ast_t
csw_check_locked(
	thread_t thread,
	processor_t processor,
	processor_set_t pset,
	ast_t check_reason);

static void processor_setrun(
	processor_t processor,
	thread_t thread,
	integer_t options);

static void
sched_realtime_timebase_init(void);

static void
sched_timer_deadline_tracking_init(void);

#if DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t
thread_bind_internal(
	thread_t thread,
	processor_t processor);

static void
sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/* Tables filled in by load_shift_init() / preempt_pri_init(). */
int8_t sched_load_shifts[NRQS];
bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/* Iterate the set bits (cpu ids) of a cpumap. */
#define cpumap_foreach(cpu_id, cpumap) \
	for (int cpu_id = lsb_first(cpumap); \
	    (cpu_id) >= 0; \
	    cpu_id = lsb_next((cpumap), cpu_id))

/* Iterate all pset nodes starting from pset_node0. */
#define foreach_node(node) \
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list)

/* Iterate the pset ids present in a node's pset_map. */
#define foreach_pset_id(pset_id, node) \
	for (int pset_id = lsb_first((node)->pset_map); \
	    pset_id >= 0; \
	    pset_id = lsb_next((node)->pset_map, pset_id))

/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 * to export to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
int enable_task_set_cluster_type = 0;
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */
563
/*
 * sched_init:
 *
 * One-time scheduler initialization at boot: resolves boot-args and
 * device-tree overrides, then initializes the configured scheduler,
 * the boot pset (pset0) and the master processor.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	/* Boot-arg takes precedence over the device tree for the decay limit. */
	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the active scheduler's name (read by sysctl, see comment above sched_string). */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	/* Dispatch into the selected scheduler implementation. */
	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		/* Value 2 additionally forces the system onto E-cores only. */
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */
}
608
609 void
sched_timebase_init(void)610 sched_timebase_init(void)
611 {
612 uint64_t abstime;
613
614 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
615 sched_one_second_interval = abstime;
616
617 SCHED(timebase_init)();
618 sched_realtime_timebase_init();
619 }
620
621 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
622
623 void
sched_timeshare_init(void)624 sched_timeshare_init(void)
625 {
626 /*
627 * Calculate the timeslicing quantum
628 * in us.
629 */
630 if (default_preemption_rate < 1) {
631 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
632 }
633 std_quantum_us = (1000 * 1000) / default_preemption_rate;
634
635 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
636
637 if (default_bg_preemption_rate < 1) {
638 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
639 }
640 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
641
642 printf("standard background quantum is %d us\n", bg_quantum_us);
643
644 load_shift_init();
645 preempt_pri_init();
646 sched_tick = 0;
647 }
648
649 void
sched_set_max_unsafe_rt_quanta(int max)650 sched_set_max_unsafe_rt_quanta(int max)
651 {
652 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
653
654 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
655
656 const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
657 sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
658
659
660 #if DEVELOPMENT || DEBUG
661 max_unsafe_rt_quanta = max;
662 #else
663 /*
664 * On RELEASE kernels, this is only called on boot where
665 * max is already equal to max_unsafe_rt_quanta.
666 */
667 assert3s(max, ==, max_unsafe_rt_quanta);
668 #endif
669 }
670
671 void
sched_set_max_unsafe_fixed_quanta(int max)672 sched_set_max_unsafe_fixed_quanta(int max)
673 {
674 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
675
676 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
677
678 const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
679 sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
680
681 #if DEVELOPMENT || DEBUG
682 max_unsafe_fixed_quanta = max;
683 #else
684 /*
685 * On RELEASE kernels, this is only called on boot where
686 * max is already equal to max_unsafe_fixed_quanta.
687 */
688 assert3s(max, ==, max_unsafe_fixed_quanta);
689 #endif
690 }
691
692 uint64_t
sched_get_quantum_us(void)693 sched_get_quantum_us(void)
694 {
695 uint32_t quantum = SCHED(initial_quantum_size)(THREAD_NULL);
696
697 uint64_t quantum_ns;
698 absolutetime_to_nanoseconds(quantum, &quantum_ns);
699
700 return quantum_ns / 1000;
701 }
702
/*
 * sched_timeshare_timebase_init:
 *
 * Convert all timeshare scheduling parameters (quanta, tick interval,
 * load-calculation interval) into absolute-time units and derive the
 * fixed-point usage-to-priority shift.  Called once the timebase is
 * known.
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 *
	 * NB: 'abstime' still holds the scheduler tick interval here; the
	 * fixed shift is derived from that value, so the ordering of the
	 * conversions above matters.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* Start all buckets at INT8_MAX (presumably "no decay" until recomputed — confirm at update site). */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* Recompute the unsafe-computation budgets now that quanta are known. */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
767
768 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
769
770 void
pset_rt_init(processor_set_t pset)771 pset_rt_init(processor_set_t pset)
772 {
773 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
774 int i = pri - BASEPRI_RTQUEUES;
775 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
776 queue_init(&rqi->pri_queue);
777 rqi->pri_count = 0;
778 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
779 rqi->pri_constraint = RT_CONSTRAINT_NONE;
780 }
781 os_atomic_init(&pset->rt_runq.count, 0);
782 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
783 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
784 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
785 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
786 }
787
/* Epsilon for comparing RT deadlines, in microseconds. */
int rt_deadline_epsilon_us = 100;

/*
 * sched_get_rt_deadline_epsilon:
 *
 * Return the current RT deadline-comparison epsilon (microseconds).
 */
int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}
796
797 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)798 sched_set_rt_deadline_epsilon(int new_epsilon_us)
799 {
800 rt_deadline_epsilon_us = new_epsilon_us;
801
802 uint64_t abstime;
803 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
804 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
805 rt_deadline_epsilon = (uint32_t)abstime;
806 }
807
808 static void
sched_realtime_timebase_init(void)809 sched_realtime_timebase_init(void)
810 {
811 uint64_t abstime;
812
813 /* smallest rt computation (50 us) */
814 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
815 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
816 min_rt_quantum = (uint32_t)abstime;
817
818 /* maximum rt computation (50 ms) */
819 clock_interval_to_absolutetime_interval(
820 50, 1000 * NSEC_PER_USEC, &abstime);
821 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
822 max_rt_quantum = (uint32_t)abstime;
823
824 /* constraint threshold for sending backup IPIs (4 ms) */
825 clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
826 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
827 rt_constraint_threshold = (uint32_t)abstime;
828
829 /* epsilon for comparing deadlines */
830 sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
831 }
832
833 void
sched_check_spill(processor_set_t pset,thread_t thread)834 sched_check_spill(processor_set_t pset, thread_t thread)
835 {
836 (void)pset;
837 (void)thread;
838
839 return;
840 }
841
842 bool
sched_thread_should_yield(processor_t processor,thread_t thread)843 sched_thread_should_yield(processor_t processor, thread_t thread)
844 {
845 (void)thread;
846
847 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
848 }
849
850 /* Default implementations of .steal_thread_enabled */
851 bool
sched_steal_thread_DISABLED(processor_set_t pset)852 sched_steal_thread_DISABLED(processor_set_t pset)
853 {
854 (void)pset;
855 return false;
856 }
857
858 bool
sched_steal_thread_enabled(processor_set_t pset)859 sched_steal_thread_enabled(processor_set_t pset)
860 {
861 return bit_count(pset->node->pset_map) > 1;
862 }
863
864 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
865
/*
 * Set up values for timeshare
 * loading factors.
 */
static void
load_shift_init(void)
{
	int8_t k, *p = sched_load_shifts;
	uint32_t i, j;

	uint32_t sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1 are special-cased: no shift / shift of 0. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/*
	 * The outer loop advances k; the inner loop fills entries [i, j)
	 * with that k, doubling the band width (j <<= 1) each pass.  Both
	 * loops share 'i', so the table is filled exactly once up to NRQS.
	 */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
915
916 static void
preempt_pri_init(void)917 preempt_pri_init(void)
918 {
919 bitmap_t *p = sched_preempt_pri;
920
921 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
922 bitmap_set(p, i);
923 }
924
925 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
926 bitmap_set(p, i);
927 }
928 }
929
930 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
931
932 void
check_monotonic_time(uint64_t ctime)933 check_monotonic_time(uint64_t ctime)
934 {
935 processor_t processor = current_processor();
936 uint64_t last_dispatch = processor->last_dispatch;
937
938 if (last_dispatch > ctime) {
939 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
940 last_dispatch, ctime);
941 }
942 }
943
944
/*
 * Thread wait timer expiration.
 * Runs in timer interrupt context with interrupts disabled.
 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	/* Only time out the wait if the timer is still armed (not cancelled by a racing wakeup). */
	if (thread->wait_timer_armed) {
		thread->wait_timer_armed = false;
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	/* This expiration consumes one outstanding timer reference. */
	thread->wait_timer_active--;

	thread_unlock(thread);
}
970
971 /*
972 * thread_unblock:
973 *
974 * Unblock thread on wake up.
975 *
976 * Returns TRUE if the thread should now be placed on the runqueue.
977 *
978 * Thread must be locked.
979 *
980 * Called at splsched().
981 */
982 boolean_t
thread_unblock(thread_t thread,wait_result_t wresult)983 thread_unblock(
984 thread_t thread,
985 wait_result_t wresult)
986 {
987 boolean_t ready_for_runq = FALSE;
988 thread_t cthread = current_thread();
989 uint32_t new_run_count;
990 int old_thread_state;
991
992 /*
993 * Set wait_result.
994 */
995 thread->wait_result = wresult;
996
997 /*
998 * Cancel pending wait timer.
999 */
1000 if (thread->wait_timer_armed) {
1001 if (timer_call_cancel(thread->wait_timer)) {
1002 thread->wait_timer_active--;
1003 }
1004 thread->wait_timer_armed = false;
1005 }
1006
1007 boolean_t aticontext, pidle;
1008 ml_get_power_state(&aticontext, &pidle);
1009
1010 /*
1011 * Update scheduling state: not waiting,
1012 * set running.
1013 */
1014 old_thread_state = thread->state;
1015 thread->state = (old_thread_state | TH_RUN) &
1016 ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);
1017
1018 if ((old_thread_state & TH_RUN) == 0) {
1019 uint64_t ctime = mach_approximate_time();
1020
1021 check_monotonic_time(ctime);
1022
1023 thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
1024 timer_start(&thread->runnable_timer, ctime);
1025
1026 ready_for_runq = TRUE;
1027
1028 if (old_thread_state & TH_WAIT_REPORT) {
1029 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
1030 }
1031
1032 /* Update the runnable thread count */
1033 new_run_count = SCHED(run_count_incr)(thread);
1034
1035 #if CONFIG_SCHED_AUTO_JOIN
1036 if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
1037 work_interval_auto_join_propagate(cthread, thread);
1038 }
1039 #endif /*CONFIG_SCHED_AUTO_JOIN */
1040
1041 } else {
1042 /*
1043 * Either the thread is idling in place on another processor,
1044 * or it hasn't finished context switching yet.
1045 */
1046 assert((thread->state & TH_IDLE) == 0);
1047 /*
1048 * The run count is only dropped after the context switch completes
1049 * and the thread is still waiting, so we should not run_incr here
1050 */
1051 new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
1052 }
1053
1054 /*
1055 * Calculate deadline for real-time threads.
1056 */
1057 if (thread->sched_mode == TH_MODE_REALTIME) {
1058 uint64_t ctime = mach_absolute_time();
1059 thread->realtime.deadline = thread->realtime.constraint + ctime;
1060 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
1061 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
1062 }
1063
1064 /*
1065 * Clear old quantum, fail-safe computation, etc.
1066 */
1067 thread->quantum_remaining = 0;
1068 thread->computation_metered = 0;
1069 thread->reason = AST_NONE;
1070 thread->block_hint = kThreadWaitNone;
1071
1072 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
1073 * We also account for "double hop" thread signaling via
1074 * the thread callout infrastructure.
1075 * DRK: consider removing the callout wakeup counters in the future
1076 * they're present for verification at the moment.
1077 */
1078
1079 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
1080 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());
1081
1082 uint64_t ttd = current_processor()->timer_call_ttd;
1083
1084 if (ttd) {
1085 if (ttd <= timer_deadline_tracking_bin_1) {
1086 thread->thread_timer_wakeups_bin_1++;
1087 } else if (ttd <= timer_deadline_tracking_bin_2) {
1088 thread->thread_timer_wakeups_bin_2++;
1089 }
1090 }
1091
1092 ledger_credit_thread(thread, thread->t_ledger,
1093 task_ledgers.interrupt_wakeups, 1);
1094 if (pidle) {
1095 ledger_credit_thread(thread, thread->t_ledger,
1096 task_ledgers.platform_idle_wakeups, 1);
1097 }
1098 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
1099 /* TODO: what about an interrupt that does a wake taken on a callout thread? */
1100 if (cthread->callout_woken_from_icontext) {
1101 ledger_credit_thread(thread, thread->t_ledger,
1102 task_ledgers.interrupt_wakeups, 1);
1103 thread->thread_callout_interrupt_wakeups++;
1104
1105 if (cthread->callout_woken_from_platform_idle) {
1106 ledger_credit_thread(thread, thread->t_ledger,
1107 task_ledgers.platform_idle_wakeups, 1);
1108 thread->thread_callout_platform_idle_wakeups++;
1109 }
1110
1111 cthread->callout_woke_thread = TRUE;
1112 }
1113 }
1114
1115 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
1116 thread->callout_woken_from_icontext = !!aticontext;
1117 thread->callout_woken_from_platform_idle = !!pidle;
1118 thread->callout_woke_thread = FALSE;
1119 }
1120
1121 #if KPERF
1122 if (ready_for_runq) {
1123 kperf_make_runnable(thread, aticontext);
1124 }
1125 #endif /* KPERF */
1126
1127 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1128 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
1129 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
1130 sched_run_buckets[TH_BUCKET_RUN], 0);
1131
1132 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());
1133
1134 return ready_for_runq;
1135 }
1136
1137 /*
1138 * Routine: thread_allowed_for_handoff
1139 * Purpose:
1140 * Check if the thread is allowed for handoff operation
1141 * Conditions:
1142 * thread lock held, IPC locks may be held.
1143 * TODO: In future, do not allow handoff if threads have different cluster
1144 * recommendations.
1145 */
1146 boolean_t
thread_allowed_for_handoff(thread_t thread)1147 thread_allowed_for_handoff(
1148 thread_t thread)
1149 {
1150 thread_t self = current_thread();
1151
1152 if (allow_direct_handoff &&
1153 thread->sched_mode == TH_MODE_REALTIME &&
1154 self->sched_mode == TH_MODE_REALTIME) {
1155 return TRUE;
1156 }
1157
1158 return FALSE;
1159 }
1160
1161 /*
1162 * Routine: thread_go
1163 * Purpose:
1164 * Unblock and dispatch thread.
1165 * Conditions:
1166 * thread lock held, IPC locks may be held.
1167 * thread must have been waiting
1168 */
1169 void
thread_go(thread_t thread,wait_result_t wresult,bool try_handoff)1170 thread_go(
1171 thread_t thread,
1172 wait_result_t wresult,
1173 bool try_handoff)
1174 {
1175 thread_t self = current_thread();
1176
1177 assert_thread_magic(thread);
1178
1179 assert(thread->at_safe_point == FALSE);
1180 assert(thread->wait_event == NO_EVENT64);
1181 assert(waitq_is_null(thread->waitq));
1182
1183 assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
1184 assert(thread->state & TH_WAIT);
1185
1186 if (thread->started) {
1187 assert(thread->state & TH_WAKING);
1188 }
1189
1190 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1191
1192 assert(ml_get_interrupts_enabled() == false);
1193
1194 if (thread_unblock(thread, wresult)) {
1195 #if SCHED_TRACE_THREAD_WAKEUPS
1196 backtrace(&thread->thread_wakeup_bt[0],
1197 (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
1198 NULL);
1199 #endif /* SCHED_TRACE_THREAD_WAKEUPS */
1200 if (try_handoff && thread_allowed_for_handoff(thread)) {
1201 thread_reference(thread);
1202 assert(self->handoff_thread == NULL);
1203 self->handoff_thread = thread;
1204
1205 /*
1206 * A TH_RUN'ed thread must have a chosen_processor.
1207 * thread_setrun would have set it, so we need to
1208 * replicate that here.
1209 */
1210 thread->chosen_processor = current_processor();
1211 } else {
1212 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1213 }
1214 }
1215 }
1216
1217 /*
1218 * Routine: thread_mark_wait_locked
1219 * Purpose:
1220 * Mark a thread as waiting. If, given the circumstances,
1221 * it doesn't want to wait (i.e. already aborted), then
1222 * indicate that in the return value.
1223 * Conditions:
1224 * at splsched() and thread is locked.
1225 */
1226 __private_extern__
1227 wait_result_t
thread_mark_wait_locked(thread_t thread,wait_interrupt_t interruptible_orig)1228 thread_mark_wait_locked(
1229 thread_t thread,
1230 wait_interrupt_t interruptible_orig)
1231 {
1232 boolean_t at_safe_point;
1233 wait_interrupt_t interruptible = interruptible_orig;
1234
1235 if (thread->state & TH_IDLE) {
1236 panic("Invalid attempt to wait while running the idle thread");
1237 }
1238
1239 assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
1240
1241 /*
1242 * The thread may have certain types of interrupts/aborts masked
1243 * off. Even if the wait location says these types of interrupts
1244 * are OK, we have to honor mask settings (outer-scoped code may
1245 * not be able to handle aborts at the moment).
1246 */
1247 interruptible &= TH_OPT_INTMASK;
1248 if (interruptible > (thread->options & TH_OPT_INTMASK)) {
1249 interruptible = thread->options & TH_OPT_INTMASK;
1250 }
1251
1252 at_safe_point = (interruptible == THREAD_ABORTSAFE);
1253
1254 if (interruptible == THREAD_UNINT ||
1255 !(thread->sched_flags & TH_SFLAG_ABORT) ||
1256 (!at_safe_point &&
1257 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1258 if (!(thread->state & TH_TERMINATE)) {
1259 DTRACE_SCHED(sleep);
1260 }
1261
1262 int state_bits = TH_WAIT;
1263 if (!interruptible) {
1264 state_bits |= TH_UNINT;
1265 }
1266 if (thread->sched_call) {
1267 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
1268 if (is_kerneltask(get_threadtask(thread))) {
1269 mask = THREAD_WAIT_NOREPORT_KERNEL;
1270 }
1271 if ((interruptible_orig & mask) == 0) {
1272 state_bits |= TH_WAIT_REPORT;
1273 }
1274 }
1275 thread->state |= state_bits;
1276 thread->at_safe_point = at_safe_point;
1277
1278 /* TODO: pass this through assert_wait instead, have
1279 * assert_wait just take a struct as an argument */
1280 assert(!thread->block_hint);
1281 thread->block_hint = thread->pending_block_hint;
1282 thread->pending_block_hint = kThreadWaitNone;
1283
1284 return thread->wait_result = THREAD_WAITING;
1285 } else {
1286 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
1287 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1288 }
1289 }
1290 thread->pending_block_hint = kThreadWaitNone;
1291
1292 return thread->wait_result = THREAD_INTERRUPTED;
1293 }
1294
1295 /*
1296 * Routine: thread_interrupt_level
1297 * Purpose:
1298 * Set the maximum interruptible state for the
1299 * current thread. The effective value of any
1300 * interruptible flag passed into assert_wait
1301 * will never exceed this.
1302 *
1303 * Useful for code that must not be interrupted,
1304 * but which calls code that doesn't know that.
1305 * Returns:
1306 * The old interrupt level for the thread.
1307 */
1308 __private_extern__
1309 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1310 thread_interrupt_level(
1311 wait_interrupt_t new_level)
1312 {
1313 thread_t thread = current_thread();
1314 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1315
1316 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1317
1318 return result;
1319 }
1320
1321 /*
1322 * assert_wait:
1323 *
1324 * Assert that the current thread is about to go to
1325 * sleep until the specified event occurs.
1326 */
1327 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1328 assert_wait(
1329 event_t event,
1330 wait_interrupt_t interruptible)
1331 {
1332 if (__improbable(event == NO_EVENT)) {
1333 panic("%s() called with NO_EVENT", __func__);
1334 }
1335
1336 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1337 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1338 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1339
1340 struct waitq *waitq;
1341 waitq = global_eventq(event);
1342 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1343 }
1344
1345 /*
1346 * assert_wait_queue:
1347 *
1348 * Return the global waitq for the specified event
1349 */
1350 struct waitq *
assert_wait_queue(event_t event)1351 assert_wait_queue(
1352 event_t event)
1353 {
1354 return global_eventq(event);
1355 }
1356
/*
 * assert_wait_timeout:
 *
 * Assert a wait on the event's global waitq with a deadline computed
 * from (interval * scale_factor) from now, system-normal urgency and
 * no leeway.  Returns the wait result from the waitq layer.
 */
wait_result_t
assert_wait_timeout(
	event_t                         event,
	wait_interrupt_t        interruptible,
	uint32_t                        interval,
	uint32_t                        scale_factor)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	uint64_t                        deadline;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	/* Take the waitq lock at splsched; the deadline is computed under it. */
	s = splsched();
	waitq_lock(waitq);

	clock_interval_to_deadline(interval, scale_factor, &deadline);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL,
	    deadline, TIMEOUT_NO_LEEWAY,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1395
/*
 * assert_wait_timeout_with_leeway:
 *
 * Like assert_wait_timeout, but with a caller-specified timeout
 * urgency and a leeway interval (both scaled by scale_factor) that
 * allow the timer system to coalesce the expiration.
 */
wait_result_t
assert_wait_timeout_with_leeway(
	event_t                         event,
	wait_interrupt_t        interruptible,
	wait_timeout_urgency_t  urgency,
	uint32_t                        interval,
	uint32_t                        leeway,
	uint32_t                        scale_factor)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	uint64_t                        deadline;
	uint64_t                        abstime;
	uint64_t                        slop;
	uint64_t                        now;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	/* Convert interval and leeway to absolute time before taking locks. */
	now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	deadline = now + abstime;

	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1442
/*
 * assert_wait_deadline:
 *
 * Assert a wait on the event's global waitq with a caller-supplied
 * absolute-time deadline, system-normal urgency and no leeway.
 */
wait_result_t
assert_wait_deadline(
	event_t                         event,
	wait_interrupt_t        interruptible,
	uint64_t                        deadline)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	    TIMEOUT_NO_LEEWAY, thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1475
/*
 * assert_wait_deadline_with_leeway:
 *
 * Like assert_wait_deadline, but with caller-specified timeout
 * urgency and absolute-time leeway for timer coalescing.
 */
wait_result_t
assert_wait_deadline_with_leeway(
	event_t                         event,
	wait_interrupt_t        interruptible,
	wait_timeout_urgency_t  urgency,
	uint64_t                        deadline,
	uint64_t                        leeway)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, leeway,
	    thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1510
/*
 * sched_cond_init:
 *
 * Initialize a scheduler condition variable to its initial state.
 */
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	os_atomic_init(cond, SCHED_COND_INIT);
}
1517
/*
 * sched_cond_wait_parameter:
 *
 * Wait on the condition with a continuation and parameter.  Clears
 * SCHED_COND_ACTIVE after asserting the wait; if a wakeup raced in
 * (SCHED_COND_WAKEUP already set), the wait is undone and the wakeup
 * acknowledged immediately instead of blocking.
 */
wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation,
	void *parameter)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	return thread_block_parameter(continuation, parameter);
}
1537
/*
 * sched_cond_wait:
 *
 * Convenience wrapper around sched_cond_wait_parameter with a
 * NULL continuation parameter.
 */
wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation)
{
	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
}
1546
/*
 * sched_cond_ack:
 *
 * Acknowledge a wakeup: atomically flip both bits, which (given the
 * expected prior state of inactive + wakeup-pending) sets
 * SCHED_COND_ACTIVE and clears SCHED_COND_WAKEUP in one operation.
 * The acquire ordering pairs with the release in sched_cond_signal.
 * Returns the new condition value.
 */
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1555
/*
 * sched_cond_signal:
 *
 * Post a wakeup on the condition for the given thread.  Only issues
 * the actual thread wakeup when this is the first pending wakeup and
 * the waiter has marked itself inactive (i.e. it is, or is about to
 * be, blocked).  The release ordering pairs with sched_cond_ack.
 */
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t *cond,
	thread_t thread)
{
	disable_preemption();
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1570
1571 /*
1572 * thread_isoncpu:
1573 *
1574 * Return TRUE if a thread is running on a processor such that an AST
1575 * is needed to pull it out of userspace execution, or if executing in
1576 * the kernel, bring to a context switch boundary that would cause
1577 * thread state to be serialized in the thread PCB.
1578 *
1579 * Thread locked, returns the same way. While locked, fields
1580 * like "state" cannot change. "runq" can change only from set to unset.
1581 */
1582 static inline boolean_t
thread_isoncpu(thread_t thread)1583 thread_isoncpu(thread_t thread)
1584 {
1585 /* Not running or runnable */
1586 if (!(thread->state & TH_RUN)) {
1587 return FALSE;
1588 }
1589
1590 /* Waiting on a runqueue, not currently running */
1591 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1592 /* TODO: This can also be incorrect for `handoff` cases where
1593 * the thread is never enqueued on the runq */
1594 if (thread_get_runq(thread) != PROCESSOR_NULL) {
1595 return FALSE;
1596 }
1597
1598 /*
1599 * Thread does not have a stack yet
1600 * It could be on the stack alloc queue or preparing to be invoked
1601 */
1602 if (!thread->kernel_stack) {
1603 return FALSE;
1604 }
1605
1606 /*
1607 * Thread must be running on a processor, or
1608 * about to run, or just did run. In all these
1609 * cases, an AST to the processor is needed
1610 * to guarantee that the thread is kicked out
1611 * of userspace and the processor has
1612 * context switched (and saved register state).
1613 */
1614 return TRUE;
1615 }
1616
1617 /*
1618 * thread_stop:
1619 *
1620 * Force a preemption point for a thread and wait
1621 * for it to stop running on a CPU. If a stronger
1622 * guarantee is requested, wait until no longer
1623 * runnable. Arbitrates access among
1624 * multiple stop requests. (released by unstop)
1625 *
1626 * The thread must enter a wait state and stop via a
1627 * separate means.
1628 *
1629 * Returns FALSE if interrupted.
1630 */
1631 boolean_t
thread_stop(thread_t thread,boolean_t until_not_runnable)1632 thread_stop(
1633 thread_t thread,
1634 boolean_t until_not_runnable)
1635 {
1636 wait_result_t wresult;
1637 spl_t s = splsched();
1638 boolean_t oncpu;
1639
1640 wake_lock(thread);
1641 thread_lock(thread);
1642
1643 while (thread->state & TH_SUSP) {
1644 thread->wake_active = TRUE;
1645 thread_unlock(thread);
1646
1647 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1648 wake_unlock(thread);
1649 splx(s);
1650
1651 if (wresult == THREAD_WAITING) {
1652 wresult = thread_block(THREAD_CONTINUE_NULL);
1653 }
1654
1655 if (wresult != THREAD_AWAKENED) {
1656 return FALSE;
1657 }
1658
1659 s = splsched();
1660 wake_lock(thread);
1661 thread_lock(thread);
1662 }
1663
1664 thread->state |= TH_SUSP;
1665
1666 while ((oncpu = thread_isoncpu(thread)) ||
1667 (until_not_runnable && (thread->state & TH_RUN))) {
1668 if (oncpu) {
1669 /*
1670 * TODO: chosen_processor isn't really the right
1671 * thing to IPI here. We really want `last_processor`,
1672 * but we also want to know where to send the IPI
1673 * *before* thread_invoke sets last_processor.
1674 *
1675 * rdar://47149497 (thread_stop doesn't IPI the right core)
1676 */
1677 assert(thread->state & TH_RUN);
1678 processor_t processor = thread->chosen_processor;
1679 assert(processor != PROCESSOR_NULL);
1680 cause_ast_check(processor);
1681 }
1682
1683 thread->wake_active = TRUE;
1684 thread_unlock(thread);
1685
1686 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1687 wake_unlock(thread);
1688 splx(s);
1689
1690 if (wresult == THREAD_WAITING) {
1691 wresult = thread_block(THREAD_CONTINUE_NULL);
1692 }
1693
1694 if (wresult != THREAD_AWAKENED) {
1695 thread_unstop(thread);
1696 return FALSE;
1697 }
1698
1699 s = splsched();
1700 wake_lock(thread);
1701 thread_lock(thread);
1702 }
1703
1704 thread_unlock(thread);
1705 wake_unlock(thread);
1706 splx(s);
1707
1708 /*
1709 * We return with the thread unlocked. To prevent it from
1710 * transitioning to a runnable state (or from TH_RUN to
1711 * being on the CPU), the caller must ensure the thread
1712 * is stopped via an external means (such as an AST)
1713 */
1714
1715 return TRUE;
1716 }
1717
1718 /*
1719 * thread_unstop:
1720 *
1721 * Release a previous stop request and set
1722 * the thread running if appropriate.
1723 *
1724 * Use only after a successful stop operation.
1725 */
1726 void
thread_unstop(thread_t thread)1727 thread_unstop(
1728 thread_t thread)
1729 {
1730 spl_t s = splsched();
1731
1732 wake_lock(thread);
1733 thread_lock(thread);
1734
1735 assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);
1736
1737 if (thread->state & TH_SUSP) {
1738 thread->state &= ~TH_SUSP;
1739
1740 if (thread->wake_active) {
1741 thread->wake_active = FALSE;
1742 thread_unlock(thread);
1743
1744 thread_wakeup(&thread->wake_active);
1745 wake_unlock(thread);
1746 splx(s);
1747
1748 return;
1749 }
1750 }
1751
1752 thread_unlock(thread);
1753 wake_unlock(thread);
1754 splx(s);
1755 }
1756
1757 /*
1758 * thread_wait:
1759 *
1760 * Wait for a thread to stop running. (non-interruptible)
1761 *
1762 */
1763 void
thread_wait(thread_t thread,boolean_t until_not_runnable)1764 thread_wait(
1765 thread_t thread,
1766 boolean_t until_not_runnable)
1767 {
1768 wait_result_t wresult;
1769 boolean_t oncpu;
1770 processor_t processor;
1771 spl_t s = splsched();
1772
1773 wake_lock(thread);
1774 thread_lock(thread);
1775
1776 /*
1777 * Wait until not running on a CPU. If stronger requirement
1778 * desired, wait until not runnable. Assumption: if thread is
1779 * on CPU, then TH_RUN is set, so we're not waiting in any case
1780 * where the original, pure "TH_RUN" check would have let us
1781 * finish.
1782 */
1783 while ((oncpu = thread_isoncpu(thread)) ||
1784 (until_not_runnable && (thread->state & TH_RUN))) {
1785 if (oncpu) {
1786 assert(thread->state & TH_RUN);
1787 processor = thread->chosen_processor;
1788 cause_ast_check(processor);
1789 }
1790
1791 thread->wake_active = TRUE;
1792 thread_unlock(thread);
1793
1794 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1795 wake_unlock(thread);
1796 splx(s);
1797
1798 if (wresult == THREAD_WAITING) {
1799 thread_block(THREAD_CONTINUE_NULL);
1800 }
1801
1802 s = splsched();
1803 wake_lock(thread);
1804 thread_lock(thread);
1805 }
1806
1807 thread_unlock(thread);
1808 wake_unlock(thread);
1809 splx(s);
1810 }
1811
1812 /*
1813 * Routine: clear_wait_internal
1814 *
1815 * Clear the wait condition for the specified thread.
1816 * Start the thread executing if that is appropriate.
1817 * Arguments:
1818 * thread thread to awaken
1819 * result Wakeup result the thread should see
1820 * Conditions:
1821 * At splsched
1822 * the thread is locked.
1823 * Returns:
1824 * KERN_SUCCESS thread was rousted out a wait
1825 * KERN_FAILURE thread was waiting but could not be rousted
1826 * KERN_NOT_WAITING thread was not waiting
1827 */
1828 __private_extern__ kern_return_t
clear_wait_internal(thread_t thread,wait_result_t wresult)1829 clear_wait_internal(
1830 thread_t thread,
1831 wait_result_t wresult)
1832 {
1833 waitq_t waitq = thread->waitq;
1834
1835 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1836 return KERN_FAILURE;
1837 }
1838
1839 /*
1840 * Check that the thread is waiting and not waking, as a waking thread
1841 * has already cleared its waitq, and is destined to be go'ed, don't
1842 * need to do it again.
1843 */
1844 if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
1845 assert(waitq_is_null(thread->waitq));
1846 return KERN_NOT_WAITING;
1847 }
1848
1849 /* may drop and retake the thread lock */
1850 if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
1851 return KERN_NOT_WAITING;
1852 }
1853
1854 thread_go(thread, wresult, /* handoff */ false);
1855
1856 return KERN_SUCCESS;
1857 }
1858
1859
1860 /*
1861 * clear_wait:
1862 *
1863 * Clear the wait condition for the specified thread. Start the thread
1864 * executing if that is appropriate.
1865 *
1866 * parameters:
1867 * thread thread to awaken
1868 * result Wakeup result the thread should see
1869 */
1870 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1871 clear_wait(
1872 thread_t thread,
1873 wait_result_t result)
1874 {
1875 kern_return_t ret;
1876 spl_t s;
1877
1878 s = splsched();
1879 thread_lock(thread);
1880
1881 ret = clear_wait_internal(thread, result);
1882
1883 if (thread == current_thread()) {
1884 /*
1885 * The thread must be ready to wait again immediately
1886 * after clearing its own wait.
1887 */
1888 assert((thread->state & TH_WAKING) == 0);
1889 }
1890
1891 thread_unlock(thread);
1892 splx(s);
1893 return ret;
1894 }
1895
1896 /*
1897 * thread_wakeup_prim:
1898 *
1899 * Common routine for thread_wakeup, thread_wakeup_with_result,
1900 * and thread_wakeup_one.
1901 *
1902 */
kern_return_t
thread_wakeup_nthreads_prim(
	event_t          event,
	uint32_t         nthreads,
	wait_result_t    result)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);
	uint32_t count;

	/* Wake up to nthreads waiters on this event; count is how many woke. */
	count = waitq_wakeup64_nthreads(wq, CAST_EVENT64_T(event), result,
	    WAITQ_WAKEUP_DEFAULT, nthreads);
	return count ? KERN_SUCCESS : KERN_NOT_WAITING;
}
1920
1921 /*
1922 * thread_wakeup_prim:
1923 *
1924 * Common routine for thread_wakeup, thread_wakeup_with_result,
1925 * and thread_wakeup_one.
1926 *
1927 */
1928 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1929 thread_wakeup_prim(
1930 event_t event,
1931 boolean_t one_thread,
1932 wait_result_t result)
1933 {
1934 if (one_thread) {
1935 return thread_wakeup_nthreads_prim(event, 1, result);
1936 } else {
1937 return thread_wakeup_nthreads_prim(event, UINT32_MAX, result);
1938 }
1939 }
1940
1941 /*
1942 * Wakeup a specified thread if and only if it's waiting for this event
1943 */
1944 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1945 thread_wakeup_thread(
1946 event_t event,
1947 thread_t thread)
1948 {
1949 if (__improbable(event == NO_EVENT)) {
1950 panic("%s() called with NO_EVENT", __func__);
1951 }
1952
1953 if (__improbable(thread == THREAD_NULL)) {
1954 panic("%s() called with THREAD_NULL", __func__);
1955 }
1956
1957 struct waitq *wq = global_eventq(event);
1958
1959 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1960 }
1961
1962 /*
1963 * thread_bind:
1964 *
1965 * Force the current thread to execute on the specified processor.
1966 * Takes effect after the next thread_block().
1967 *
1968 * Returns the previous binding. PROCESSOR_NULL means
1969 * not bound.
1970 *
1971 * XXX - DO NOT export this to users - XXX
1972 */
1973 processor_t
thread_bind(processor_t processor)1974 thread_bind(
1975 processor_t processor)
1976 {
1977 thread_t self = current_thread();
1978 processor_t prev;
1979 spl_t s;
1980
1981 s = splsched();
1982 thread_lock(self);
1983
1984 prev = thread_bind_internal(self, processor);
1985
1986 thread_unlock(self);
1987 splx(s);
1988
1989 return prev;
1990 }
1991
/*
 * thread_bind_during_wakeup:
 *
 * Bind a thread to a processor while it is in the middle of waking up
 * (must be in TH_WAIT | TH_WAKING).  No-op if already bound there.
 * Called with interrupts disabled and the thread locked.
 */
void
thread_bind_during_wakeup(thread_t thread, processor_t processor)
{
	assert(!ml_get_interrupts_enabled());
	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
#if MACH_ASSERT
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
#endif

	if (thread->bound_processor != processor) {
		thread_bind_internal(thread, processor);
	}
}
2005
/*
 * thread_unbind_after_queue_shutdown:
 *
 * Clear a thread's processor binding after its bound processor's
 * run queue has been shut down.  If the thread was sitting on that
 * runqueue, it is removed, unbound, and re-enqueued so the scheduler
 * can place it elsewhere.  Called with interrupts disabled.
 */
void
thread_unbind_after_queue_shutdown(
	thread_t                thread,
	processor_t             processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		assert(thread->bound_processor == processor);

		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);
		if (removed) {
			/* Re-enqueue so the scheduler picks a new (unbound) home. */
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}
2033
2034 /*
2035 * thread_bind_internal:
2036 *
2037 * If the specified thread is not the current thread, and it is currently
2038 * running on another CPU, a remote AST must be sent to that CPU to cause
2039 * the thread to migrate to its bound processor. Otherwise, the migration
2040 * will occur at the next quantum expiration or blocking point.
2041 *
2042 * When the thread is the current thread, and explicit thread_block() should
2043 * be used to force the current processor to context switch away and
2044 * let the thread migrate to the bound processor.
2045 *
2046 * Thread must be locked, and at splsched.
2047 */
2048
2049 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)2050 thread_bind_internal(
2051 thread_t thread,
2052 processor_t processor)
2053 {
2054 processor_t prev;
2055
2056 /* <rdar://problem/15102234> */
2057 assert(thread->sched_pri < BASEPRI_RTQUEUES);
2058 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
2059 thread_assert_runq_null(thread);
2060
2061 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
2062 thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
2063
2064 prev = thread->bound_processor;
2065 thread->bound_processor = processor;
2066
2067 return prev;
2068 }
2069
2070 /*
2071 * thread_vm_bind_group_add:
2072 *
2073 * The "VM bind group" is a special mechanism to mark a collection
2074 * of threads from the VM subsystem that, in general, should be scheduled
2075 * with only one CPU of parallelism. To accomplish this, we initially
2076 * bind all the threads to the master processor, which has the effect
2077 * that only one of the threads in the group can execute at once, including
2078 * preempting threads in the group that are a lower priority. Future
2079 * mechanisms may use more dynamic mechanisms to prevent the collection
2080 * of VM threads from using more CPU time than desired.
2081 *
2082 * The current implementation can result in priority inversions where
2083 * compute-bound priority 95 or realtime threads that happen to have
2084 * landed on the master processor prevent the VM threads from running.
2085 * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
 * additional CPUs, before restoring the binding (assuming high latency
2088 * is no longer a problem).
2089 */
2090
2091 /*
2092 * The current max is provisioned for:
2093 * vm_compressor_swap_trigger_thread (92)
2094 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
2095 * vm_pageout_continue (92)
2096 * memorystatus_thread (95)
2097 */
2098 #define MAX_VM_BIND_GROUP_COUNT (5)
2099 decl_simple_lock_data(static, sched_vm_group_list_lock);
2100 static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
2101 static int sched_vm_group_thread_count;
2102 static boolean_t sched_vm_group_temporarily_unbound = FALSE;
2103
/*
 * thread_vm_bind_group_add:
 *
 * Add the calling thread to the VM bind group (see the block comment
 * above): mark it TH_OPT_SCHED_VM_GROUP, record it in the group list,
 * bind it to the master processor, and block so the migration to that
 * processor takes effect immediately.
 *
 * Takes a thread reference that is never dropped — group membership
 * is permanent for the life of the system.
 */
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	if (support_bootcpu_shutdown) {
		/*
		 * Bind group is not supported without an always-on
		 * processor to bind to. If we need these to coexist,
		 * we'd need to dynamically move the group to
		 * another processor as it shuts down, or build
		 * a different way to run a set of threads
		 * without parallelism.
		 */
		return;
	}

	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
2134
/*
 * sched_vm_group_maintenance:
 *
 * Periodic maintenance for the VM bind group. Scans the group looking
 * for two signals:
 *  - high_latency_observed: a runnable group thread has been sitting on
 *    a run queue for longer than a scheduler tick;
 *  - runnable_and_not_on_runq_observed: a runnable group thread is not
 *    on any run queue (i.e. it is executing, or in transition).
 *
 * If binding is active and high latency is seen with no thread making
 * progress, the group is temporarily unbound so the threads can spread
 * to additional CPUs. Once latency clears, the group is rebound to the
 * master processor.
 */
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread in transition also falls into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				/* Safe to rebind: thread is off-queue or fully blocked. */
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					/* Unbinding mid-flight is always safe. */
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2232
2233 #if defined(__x86_64__)
2234 #define SCHED_AVOID_CPU0 1
2235 #else
2236 #define SCHED_AVOID_CPU0 0
2237 #endif
2238
2239 int sched_allow_rt_smt = 1;
2240 int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
2241 int sched_allow_rt_steal = 1;
2242 int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
2243
2244 int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2245
/*
 * sched_get_rt_n_backup_processors:
 *
 * Return the current number of backup processors signalled for
 * realtime threads (see sched_rt_n_backup_processors above).
 */
int
sched_get_rt_n_backup_processors(void)
{
	return sched_rt_n_backup_processors;
}
2251
2252 void
sched_set_rt_n_backup_processors(int n)2253 sched_set_rt_n_backup_processors(int n)
2254 {
2255 if (n < 0) {
2256 n = 0;
2257 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2258 n = SCHED_MAX_BACKUP_PROCESSORS;
2259 }
2260
2261 sched_rt_n_backup_processors = n;
2262 }
2263
2264 int sched_rt_runq_strict_priority = false;
2265
2266 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2267 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2268 {
2269 if (current_pset != new_pset) {
2270 pset_unlock(current_pset);
2271 pset_lock(new_pset);
2272 }
2273
2274 return new_pset;
2275 }
2276
2277 /*
2278 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2279 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2280 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2281 * IPI thrash if this core does not remain idle following the load balancing ASTs
2282 * Idle "thrash", when IPI issue is followed by idle entry/core power down
2283 * followed by a wakeup shortly thereafter.
2284 */
2285
2286 #if (DEVELOPMENT || DEBUG)
2287 int sched_smt_balance = 1;
2288 #endif
2289
2290 #if CONFIG_SCHED_SMT
/*
 * sched_SMT_balance:
 *
 * Invoked with pset locked, returns with pset unlocked.
 *
 * If this (idle) processor and its SMT sibling are both idle while some
 * other secondary in the pset is running a non-realtime thread whose
 * primary is also running, signal that secondary so its load can be
 * rebalanced onto this idle pair. At most one IPI is sent per call.
 */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	/* Tunable escape hatch: sched_smt_balance=0 disables balancing. */
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Secondaries that are RUNNING: candidates to push work off of. */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Skip secondaries running realtime work; only non-RT is rebalanced. */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* IPI is issued after dropping the pset lock. */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2344 #else /* CONFIG_SCHED_SMT */
/*
 * Non-SMT stub: no balancing to do, but the contract still requires
 * the pset to be unlocked on return.
 * Invoked with pset locked, returns with pset unlocked.
 */
bool
sched_SMT_balance(__unused processor_t cprocessor, __unused processor_set_t cpset)
{
	pset_unlock(cpset);
	return false;
}
2352 #endif /* CONFIG_SCHED_SMT */
2353
2354
2355 static cpumap_t
pset_available_cpumap(processor_set_t pset)2356 pset_available_cpumap(processor_set_t pset)
2357 {
2358 return pset->cpu_available_map & pset->recommended_bitmask;
2359 }
2360
2361 int
pset_available_cpu_count(processor_set_t pset)2362 pset_available_cpu_count(processor_set_t pset)
2363 {
2364 return bit_count(pset_available_cpumap(pset));
2365 }
2366
2367 bool
pset_is_recommended(processor_set_t pset)2368 pset_is_recommended(processor_set_t pset)
2369 {
2370 if (!pset) {
2371 return false;
2372 }
2373 return pset_available_cpu_count(pset) > 0;
2374 }
2375
2376 bool
pset_type_is_recommended(processor_set_t pset)2377 pset_type_is_recommended(processor_set_t pset)
2378 {
2379 if (!pset) {
2380 return false;
2381 }
2382 pset_map_t recommended_psets = os_atomic_load(&pset->node->pset_recommended_map, relaxed);
2383 return bit_count(recommended_psets) > 0;
2384 }
2385
2386 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2387 pset_available_but_not_running_cpumap(processor_set_t pset)
2388 {
2389 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2390 pset->recommended_bitmask;
2391 }
2392
/*
 * pset_has_stealable_threads:
 *
 * True when this pset's run queue holds threads beyond what its
 * available-but-not-running CPUs can absorb, i.e. other psets could
 * usefully steal from it. Pset must be locked (asserted).
 */
bool
pset_has_stealable_threads(processor_set_t pset)
{
	pset_assert_locked(pset);

	cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
#if CONFIG_SCHED_SMT
	/*
	 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
	 * available primary CPUs
	 */
	avail_map &= pset->primary_map;
#endif /* CONFIG_SCHED_SMT */

	/* RT threads count against capacity too: they will consume these CPUs first. */
	return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
}
2409
/*
 * pset_available_but_not_running_rt_threads_cpumap:
 *
 * Map of available CPUs in this pset that are eligible to run realtime
 * threads and are not already running one (per pset->realtime_map).
 */
static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
{
	cpumap_t avail_map = pset_available_cpumap(pset);
#if CONFIG_SCHED_SMT
	if (!sched_allow_rt_smt) {
		/*
		 * Secondary CPUs are not allowed to run RT threads, so
		 * only primary CPUs should be included
		 */
		avail_map &= pset->primary_map;
	}
#endif /* CONFIG_SCHED_SMT */

	return avail_map & ~pset->realtime_map;
}
2426
2427 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2428 pset_needs_a_followup_IPI(processor_set_t pset)
2429 {
2430 int nbackup_cpus = 0;
2431
2432 if (rt_runq_is_low_latency(pset)) {
2433 nbackup_cpus = sched_rt_n_backup_processors;
2434 }
2435
2436 int rt_rq_count = rt_runq_count(pset);
2437
2438 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2439 }
2440
2441 bool
pset_has_stealable_rt_threads(processor_set_t pset)2442 pset_has_stealable_rt_threads(processor_set_t pset)
2443 {
2444 pset_node_t node = pset->node;
2445 if (bit_count(node->pset_map) == 1) {
2446 return false;
2447 }
2448
2449 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2450
2451 return rt_runq_count(pset) > bit_count(avail_map);
2452 }
2453
2454 static void
pset_update_rt_stealable_state(processor_set_t pset)2455 pset_update_rt_stealable_state(processor_set_t pset)
2456 {
2457 if (pset_has_stealable_rt_threads(pset)) {
2458 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2459 } else {
2460 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2461 }
2462 }
2463
/*
 * clear_pending_AST_bits:
 *
 * Acknowledge this processor's pending IPIs by clearing its bit in the
 * pset's urgent, preempt, and (when configured) deferred AST masks.
 * trace_point_number distinguishes call sites in the tracepoint.
 * Pset must be locked (asserted).
 */
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);
	/* Only trace when the urgent bit was actually set. */
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2479
/*
 * Called with pset locked, on a processor that is committing to run a new thread
 * Will transition an idle or dispatching processor to running as it picks up
 * the first new thread from the idle thread.
 *
 * Also maintains the pset's realtime_map, its stealable-RT state, and —
 * on multi-pset nodes — the node-level non-RT pset maps used by
 * realtime processor selection.
 */
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
	pset_assert_locked(pset);

	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
		/* Only the idle thread commits a processor out of idle/dispatching. */
		assert(current_thread() == processor->idle_thread);

		/*
		 * Dispatching processor is now committed to running new_thread,
		 * so change its state to PROCESSOR_RUNNING.
		 */
		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	} else {
		assert(processor->state == PROCESSOR_RUNNING);
	}

	processor_state_update_from_thread(processor, new_thread, true);

	/* Track which CPUs are running realtime threads. */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		bit_set(pset->realtime_map, processor->cpu_id);
	} else {
		bit_clear(pset->realtime_map, processor->cpu_id);
	}
	pset_update_rt_stealable_state(pset);

	pset_node_t node = pset->node;

	if (bit_count(node->pset_map) == 1) {
		/* Node has only a single pset, so skip node pset map updates */
		return;
	}

	cpumap_t avail_map = pset_available_cpumap(pset);

	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
		}
#if CONFIG_SCHED_SMT
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT primary CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	} else {
		/* A non-RT CPU exists again; set the node bits if not already set. */
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
			}
		}
#if CONFIG_SCHED_SMT
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
			}
		}
#endif /* CONFIG_SCHED_SMT */
	}
}
2548
2549 #if CONFIG_SCHED_SMT
2550 static processor_t choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2551 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2552 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2553 #else /* CONFIG_SCHED_SMT */
2554 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills);
2555 #endif /* CONFIG_SCHED_SMT */
2556 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2557 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2558 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2559 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2560 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2561
2562 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2563 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2564 {
2565 pset_map_t pset_map = stealing_pset->node->pset_map;
2566
2567 bit_clear(pset_map, stealing_pset->pset_id);
2568
2569 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2570 processor_set_t nset = pset_array[pset_id];
2571
2572 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2573 return true;
2574 }
2575 }
2576
2577 return false;
2578 }
2579
/*
 * choose_next_rt_processor_for_IPI:
 *
 * Find another processor to IPI so a queued realtime thread on
 * starting_pset gets picked up. With spill_ipi, search the other psets
 * of the node (marking the chosen CPU in rt_pending_spill_cpu_mask);
 * otherwise search only starting_pset for a followup processor.
 * The chosen processor and IPI type are returned via out-parameters;
 * the caller performs the IPI after dropping its locks.
 *
 * starting_pset must be locked, but returns true if it is unlocked before return
 */
static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
    processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
{
	bool starting_pset_is_unlocked = false;
	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
	int max_pri = rt_runq_priority(starting_pset);
	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
	processor_set_t pset = starting_pset;
	processor_t next_rt_processor = PROCESSOR_NULL;
	if (spill_ipi) {
		/* Spill targets other psets: start the walk at the next one. */
		processor_set_t nset = next_pset(pset);
		assert(nset != starting_pset);
		pset = change_locked_pset(pset, nset);
		starting_pset_is_unlocked = true;
	}
	/* Walk psets (or just this one when !spill_ipi) until a CPU is found. */
	do {
		const bool consider_secondaries = true;
		next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
		if (next_rt_processor == PROCESSOR_NULL) {
			if (!spill_ipi) {
				break;
			}
			processor_set_t nset = next_pset(pset);
			if (nset == starting_pset) {
				/* Wrapped around the node without finding a CPU. */
				break;
			}
			pset = change_locked_pset(pset, nset);
			starting_pset_is_unlocked = true;
		}
	} while (next_rt_processor == PROCESSOR_NULL);
	if (next_rt_processor) {
		if (pset != starting_pset) {
			/* Remote CPU: record the pending spill, tracing the 0->1 edge. */
			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
			}
		}
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
	if (pset != starting_pset) {
		pset_unlock(pset);
	}

	return starting_pset_is_unlocked;
}
2630
/*
 * Classification of the extra processor chosen to receive an IPI:
 *
 * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
 * followup processor - used in thread_select when there are still threads on the run queue and available processors
 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
 */
typedef enum {
	none,           /* no additional processor selected */
	backup,
	followup,
	spill
} next_processor_type_t;
2642
2643 #undef LOOP_COUNT
2644 #ifdef LOOP_COUNT
2645 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2646 #endif
2647
2648 /*
2649 * thread_select:
2650 *
2651 * Select a new thread for the current processor to execute.
2652 *
2653 * May select the current thread, which must be locked.
2654 */
2655 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2656 thread_select(thread_t thread,
2657 processor_t processor,
2658 ast_t *reason)
2659 {
2660 processor_set_t pset = processor->processor_set;
2661 thread_t new_thread = THREAD_NULL;
2662
2663 assert(processor == current_processor());
2664 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2665
2666 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2667 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2668
2669 __kdebug_only int idle_reason = 0;
2670 __kdebug_only int delay_count = 0;
2671
2672 #if CONFIG_SCHED_SMT
2673 int timeout_count = sched_backup_cpu_timeout_count;
2674 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2675 /* Prefer cpu0 as backup */
2676 timeout_count--;
2677 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2678 /* Prefer secondary cpu as backup */
2679 timeout_count--;
2680 }
2681 #endif /* CONFIG_SCHED_SMT */
2682 bool pending_AST_URGENT = false;
2683 bool pending_AST_PREEMPT = false;
2684
2685 #ifdef LOOP_COUNT
2686 int loop_count = -1;
2687 #endif
2688
2689 do {
2690 /*
2691 * Update the priority.
2692 */
2693 if (SCHED(can_update_priority)(thread)) {
2694 SCHED(update_priority)(thread);
2695 }
2696
2697 pset_lock(pset);
2698
2699 restart:
2700 #ifdef LOOP_COUNT
2701 loop_count++;
2702 if (loop_count > max_loop_count[processor->cpu_id]) {
2703 max_loop_count[processor->cpu_id] = loop_count;
2704 if (bit_count(loop_count) == 1) {
2705 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2706 }
2707 }
2708 #endif
2709 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2710 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2711
2712 processor_state_update_from_thread(processor, thread, true);
2713
2714 idle_reason = 0;
2715
2716 processor_t ast_processor = PROCESSOR_NULL;
2717 processor_t next_rt_processor = PROCESSOR_NULL;
2718 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2719 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2720
2721 assert(processor->state != PROCESSOR_OFF_LINE);
2722
2723 /*
2724 * Bound threads are dispatched to a processor without going through
2725 * choose_processor(), so in those cases we must continue trying to dequeue work
2726 * as we are the only option.
2727 */
2728 if (!SCHED(processor_bound_count)(processor)) {
2729 if (!processor->is_recommended) {
2730 /*
2731 * The performance controller has provided a hint to not dispatch more threads,
2732 */
2733 idle_reason = 1;
2734 goto send_followup_ipi_before_idle;
2735 } else if (rt_runq_count(pset)) {
2736 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2737 /* Give the current RT thread a chance to complete */
2738 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2739 #if CONFIG_SCHED_SMT
2740 /*
2741 * On Intel we want to avoid SMT secondary processors and processor 0
2742 * but allow them to be used as backup processors in case the preferred chosen
2743 * processor is delayed by interrupts or processor stalls. So if it is
2744 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2745 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2746 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2747 * to grab the thread before the (current) backup processor does.
2748 *
2749 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2750 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2751 * cpu0 before secondary cpus or not.
2752 */
2753 if (!ok_to_run_realtime_thread) {
2754 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2755 if (timeout_count-- > 0) {
2756 pset_unlock(pset);
2757 thread_unlock(thread);
2758 delay(10);
2759 delay_count++;
2760 thread_lock(thread);
2761 pset_lock(pset);
2762 goto restart;
2763 }
2764 ok_to_run_realtime_thread = true;
2765 }
2766 }
2767 #endif /* CONFIG_SCHED_SMT */
2768 if (!ok_to_run_realtime_thread) {
2769 idle_reason = 2;
2770 goto send_followup_ipi_before_idle;
2771 }
2772 }
2773 #if CONFIG_SCHED_SMT
2774 else if (processor->processor_primary != processor) {
2775 /*
2776 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2777 * we should look for work only under the same conditions that choose_processor()
2778 * would have assigned work, which is when all primary processors have been assigned work.
2779 */
2780 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2781 /* There are idle primaries */
2782 idle_reason = 3;
2783 goto idle;
2784 }
2785 }
2786 #endif /* CONFIG_SCHED_SMT */
2787 }
2788
2789 /*
2790 * Test to see if the current thread should continue
2791 * to run on this processor. Must not be attempting to wait, and not
2792 * bound to a different processor, nor be in the wrong
2793 * processor set, nor be forced to context switch by TH_SUSP.
2794 *
2795 * Note that there are never any RT threads in the regular runqueue.
2796 *
2797 * This code is very insanely tricky.
2798 */
2799
2800 /* i.e. not waiting, not TH_SUSP'ed */
2801 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2802
2803 /*
2804 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2805 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2806 * <rdar://problem/47907700>
2807 *
2808 * A yielding thread shouldn't be forced to context switch.
2809 */
2810
2811 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2812
2813 #if CONFIG_SCHED_SMT
2814 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2815 #endif /* CONFIG_SCHED_SMT */
2816
2817 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2818
2819 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2820
2821 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2822
2823 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2824
2825 bool current_thread_can_keep_running = (
2826 still_running
2827 #if CONFIG_SCHED_SMT
2828 && !needs_smt_rebalance
2829 #endif /* CONFIG_SCHED_SMT */
2830 && !affinity_mismatch
2831 && !bound_elsewhere
2832 && !avoid_processor);
2833 if (current_thread_can_keep_running) {
2834 /*
2835 * This thread is eligible to keep running on this processor.
2836 *
2837 * RT threads with un-expired quantum stay on processor,
2838 * unless there's a valid RT thread with an earlier deadline
2839 * and it is still ok_to_run_realtime_thread.
2840 */
2841 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2842 /*
2843 * Pick a new RT thread only if ok_to_run_realtime_thread
2844 * (but the current thread is allowed to complete).
2845 */
2846 if (ok_to_run_realtime_thread) {
2847 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2848 goto pick_new_rt_thread;
2849 }
2850 if (rt_runq_priority(pset) > thread->sched_pri) {
2851 if (sched_rt_runq_strict_priority) {
2852 /* The next RT thread is better, so pick it off the runqueue. */
2853 goto pick_new_rt_thread;
2854 }
2855
2856 /*
2857 * See if the current lower priority thread can continue to run without causing
2858 * the higher priority thread on the runq queue to miss its deadline.
2859 */
2860 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2861 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2862 /* The next RT thread is better, so pick it off the runqueue. */
2863 goto pick_new_rt_thread;
2864 }
2865 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2866 /* The next RT thread is better, so pick it off the runqueue. */
2867 goto pick_new_rt_thread;
2868 }
2869 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2870 goto pick_new_rt_thread;
2871 }
2872 }
2873
2874 /* This is still the best RT thread to run. */
2875 processor->deadline = thread->realtime.deadline;
2876
2877 sched_update_pset_load_average(pset, 0);
2878
2879 clear_pending_AST_bits(pset, processor, 1);
2880
2881 next_rt_processor = PROCESSOR_NULL;
2882 next_rt_ipi_type = SCHED_IPI_NONE;
2883
2884 bool pset_unlocked = false;
2885 __kdebug_only next_processor_type_t nptype = none;
2886 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2887 nptype = spill;
2888 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2889 } else if (pset_needs_a_followup_IPI(pset)) {
2890 nptype = followup;
2891 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2892 }
2893 if (!pset_unlocked) {
2894 pset_unlock(pset);
2895 }
2896
2897 if (next_rt_processor) {
2898 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2899 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2900 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2901 }
2902
2903 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2904 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2905 return thread;
2906 }
2907
2908 if ((rt_runq_count(pset) == 0) &&
2909 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2910 /* This thread is still the highest priority runnable (non-idle) thread */
2911 processor->deadline = RT_DEADLINE_NONE;
2912
2913 sched_update_pset_load_average(pset, 0);
2914
2915 clear_pending_AST_bits(pset, processor, 2);
2916
2917 pset_unlock(pset);
2918
2919 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2920 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2921 return thread;
2922 }
2923 } else {
2924 /*
2925 * This processor must context switch.
2926 * If it's due to a rebalance, we should aggressively find this thread a new home.
2927 */
2928 bool ast_rebalance = affinity_mismatch || bound_elsewhere || avoid_processor;
2929 #if CONFIG_SCHED_SMT
2930 ast_rebalance = ast_rebalance || needs_smt_rebalance;
2931 #endif /* CONFIG_SCHED_SMT */
2932 if (ast_rebalance) {
2933 *reason |= AST_REBALANCE;
2934 }
2935 }
2936
2937 #if CONFIG_SCHED_SMT
2938 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2939 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2940 (processor->processor_secondary->state == PROCESSOR_IDLE));
2941 #endif /* CONFIG_SCHED_SMT */
2942
2943 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2944 if (ok_to_run_realtime_thread) {
2945 pick_new_rt_thread:
2946 new_thread = sched_rt_choose_thread(pset);
2947 if (new_thread != THREAD_NULL) {
2948 processor->deadline = new_thread->realtime.deadline;
2949 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2950
2951 clear_pending_AST_bits(pset, processor, 3);
2952
2953 #if CONFIG_SCHED_SMT
2954 if (processor->processor_secondary != NULL) {
2955 processor_t sprocessor = processor->processor_secondary;
2956 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2957 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2958 ast_processor = sprocessor;
2959 }
2960 }
2961 #endif /* CONFIG_SCHED_SMT */
2962 }
2963 }
2964
2965 send_followup_ipi_before_idle:
2966 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2967 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2968 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2969 }
2970 __kdebug_only next_processor_type_t nptype = none;
2971 bool pset_unlocked = false;
2972 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2973 nptype = spill;
2974 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2975 } else if (pset_needs_a_followup_IPI(pset)) {
2976 nptype = followup;
2977 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2978 }
2979
2980 assert(new_thread || !ast_processor);
2981 if (new_thread || next_rt_processor) {
2982 if (!pset_unlocked) {
2983 pset_unlock(pset);
2984 pset_unlocked = true;
2985 }
2986 if (ast_processor == next_rt_processor) {
2987 ast_processor = PROCESSOR_NULL;
2988 ipi_type = SCHED_IPI_NONE;
2989 }
2990
2991 if (ast_processor) {
2992 sched_ipi_perform(ast_processor, ipi_type);
2993 }
2994
2995 if (next_rt_processor) {
2996 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2997 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2998 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2999 }
3000
3001 if (new_thread) {
3002 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
3003 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
3004 return new_thread;
3005 }
3006 }
3007
3008 if (pset_unlocked) {
3009 pset_lock(pset);
3010 }
3011
3012 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
3013 /* Things changed while we dropped the lock */
3014 goto restart;
3015 }
3016
3017 if (processor->is_recommended) {
3018 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
3019 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
3020 /* Things changed while we dropped the lock */
3021 goto restart;
3022 }
3023
3024 #if CONFIG_SCHED_SMT
3025 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
3026 /* secondary can only run realtime thread */
3027 if (idle_reason == 0) {
3028 idle_reason = 4;
3029 }
3030 goto idle;
3031 }
3032 #endif /* CONFIG_SCHED_SMT */
3033 } else if (!SCHED(processor_bound_count)(processor)) {
3034 /* processor not recommended and no bound threads */
3035 if (idle_reason == 0) {
3036 idle_reason = 5;
3037 }
3038 goto idle;
3039 }
3040
3041 processor->deadline = RT_DEADLINE_NONE;
3042
3043 /* No RT threads, so let's look at the regular threads. */
3044 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, current_thread_can_keep_running ? thread : THREAD_NULL, *reason)) != THREAD_NULL) {
3045 if (new_thread != thread) {
3046 /* Going to context-switch */
3047 pset_commit_processor_to_new_thread(pset, processor, new_thread);
3048
3049 clear_pending_AST_bits(pset, processor, 4);
3050
3051 ast_processor = PROCESSOR_NULL;
3052 ipi_type = SCHED_IPI_NONE;
3053
3054 #if CONFIG_SCHED_SMT
3055 processor_t sprocessor = processor->processor_secondary;
3056 if (sprocessor != NULL) {
3057 if (sprocessor->state == PROCESSOR_RUNNING) {
3058 if (thread_no_smt(new_thread)) {
3059 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
3060 ast_processor = sprocessor;
3061 }
3062 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
3063 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
3064 ast_processor = sprocessor;
3065 }
3066 }
3067 #endif /* CONFIG_SCHED_SMT */
3068
3069 pset_unlock(pset);
3070
3071 if (ast_processor) {
3072 sched_ipi_perform(ast_processor, ipi_type);
3073 }
3074 } else {
3075 /* Will continue running the current thread */
3076 clear_pending_AST_bits(pset, processor, 4);
3077 pset_unlock(pset);
3078 }
3079
3080 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
3081 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
3082 return new_thread;
3083 }
3084
3085 if (processor->must_idle) {
3086 processor->must_idle = false;
3087 *reason |= AST_REBALANCE;
3088 idle_reason = 6;
3089 goto idle;
3090 }
3091
3092 if (SCHED(steal_thread_enabled)(pset)
3093 #if CONFIG_SCHED_SMT
3094 && (processor->processor_primary == processor)
3095 #endif /* CONFIG_SCHED_SMT */
3096 ) {
3097 /*
3098 * No runnable threads, attempt to steal
3099 * from other processors. Returns with pset lock dropped.
3100 */
3101
3102 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
3103 pset_lock(pset);
3104 pset_commit_processor_to_new_thread(pset, processor, new_thread);
3105 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
3106 /*
3107 * A realtime thread choose this processor while it was DISPATCHING
3108 * and the pset lock was dropped
3109 */
3110 ast_on(AST_URGENT | AST_PREEMPT);
3111 }
3112
3113 clear_pending_AST_bits(pset, processor, 5);
3114
3115 pset_unlock(pset);
3116
3117 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
3118 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
3119 return new_thread;
3120 }
3121
3122 /*
3123 * If other threads have appeared, shortcut
3124 * around again.
3125 */
3126 if (SCHED(processor_bound_count)(processor)) {
3127 continue;
3128 }
3129 if (processor->is_recommended) {
3130 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
3131 continue;
3132 }
3133 }
3134
3135 pset_lock(pset);
3136 }
3137
3138 idle:
3139 /* Someone selected this processor while we had dropped the lock */
3140 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
3141 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
3142 goto restart;
3143 }
3144
3145 if ((idle_reason == 0) && current_thread_can_keep_running) {
3146 /* This thread is the only runnable (non-idle) thread */
3147 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3148 processor->deadline = thread->realtime.deadline;
3149 } else {
3150 processor->deadline = RT_DEADLINE_NONE;
3151 }
3152
3153 sched_update_pset_load_average(pset, 0);
3154
3155 clear_pending_AST_bits(pset, processor, 6);
3156
3157 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
3158 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
3159 pset_unlock(pset);
3160 return thread;
3161 }
3162
3163 /*
3164 * Nothing is runnable, or this processor must be forced idle,
3165 * so set this processor idle if it was running.
3166 */
3167 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
3168 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
3169 processor_state_update_idle(processor);
3170 }
3171 pset_update_rt_stealable_state(pset);
3172
3173 clear_pending_AST_bits(pset, processor, 7);
3174
3175 /* Invoked with pset locked, returns with pset unlocked */
3176 processor->next_idle_short = SCHED(processor_balance)(processor, pset);
3177
3178 new_thread = processor->idle_thread;
3179 } while (new_thread == THREAD_NULL);
3180
3181 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
3182 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
3183 return new_thread;
3184 }
3185
3186 /*
3187 * thread_invoke
3188 *
3189 * Called at splsched with neither thread locked.
3190 *
3191 * Perform a context switch and start executing the new thread.
3192 *
3193 * Returns FALSE when the context switch didn't happen.
3194 * The reference to the new thread is still consumed.
3195 *
3196 * "self" is what is currently running on the processor,
3197 * "thread" is the new thread to context switch to
3198 * (which may be the same thread in some cases)
3199 */
static boolean_t
thread_invoke(
	thread_t self,
	thread_t thread,
	ast_t reason)
{
	/*
	 * Blocking with a preemption level other than zero is a bug:
	 * either a lock was over-released (negative level) or we are
	 * trying to block while holding a spinlock / in interrupt context.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Capture 'self''s continuation before any state changes below. */
	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	/* Single timestamp used consistently for all accounting in this switch. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip maintenance when switching to the idle thread, or on a
	 * realtime handoff (keeps the handoff path fast).
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime, true);
	}
#endif

	recount_log_switch_thread(&snap);

	processor_t processor = current_processor();

	if (!processor->processor_online) {
		panic("Invalid attempt to context switch an offline processor");
	}

	assert_thread_magic(self);
	assert(self == current_thread());
	thread_assert_runq_null(self);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	/* Lock the incoming thread while we update its dispatch state. */
	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor);
	thread_assert_runq_null(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	if (continuation != NULL) {
		/*
		 * 'self' is parking in a continuation, so its register state
		 * need not be saved — enabling the cheaper switch paths below.
		 */
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for the incoming thread. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required. However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			/* Resume the new thread in its continuation; does not return. */
			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/*
			 * No stack available now: park the thread on the
			 * stack-allocation queue and report that the switch
			 * did not happen.
			 */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with no continuation: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	/* Track processor and pset migrations for the incoming thread. */
	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required. We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* Previous snap on the old stack is gone. */
	recount_log_switch_thread_on(NULL);

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3532
3533 #if defined(CONFIG_SCHED_DEFERRED_AST)
3534 /*
3535 * pset_cancel_deferred_dispatch:
3536 *
3537 * Cancels all ASTs that we can cancel for the given processor set
3538 * if the current processor is running the last runnable thread in the
3539 * system.
3540 *
3541 * This function assumes the current thread is runnable. This must
3542 * be called with the pset unlocked.
3543 */
3544 static void
pset_cancel_deferred_dispatch(processor_set_t pset,processor_t processor)3545 pset_cancel_deferred_dispatch(
3546 processor_set_t pset,
3547 processor_t processor)
3548 {
3549 processor_t active_processor = NULL;
3550 uint32_t sampled_sched_run_count;
3551
3552 pset_lock(pset);
3553 sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3554
3555 /*
3556 * If we have emptied the run queue, and our current thread is runnable, we
3557 * should tell any processors that are still DISPATCHING that they will
3558 * probably not have any work to do. In the event that there are no
3559 * pending signals that we can cancel, this is also uninteresting.
3560 *
3561 * In the unlikely event that another thread becomes runnable while we are
3562 * doing this (sched_run_count is atomically updated, not guarded), the
3563 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3564 * in order to dispatch it to a processor in our pset. So, the other
3565 * codepath will wait while we squash all cancelable ASTs, get the pset
3566 * lock, and then dispatch the freshly runnable thread. So this should be
3567 * correct (we won't accidentally have a runnable thread that hasn't been
3568 * dispatched to an idle processor), if not ideal (we may be restarting the
3569 * dispatch process, which could have some overhead).
3570 */
3571
3572 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3573 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3574 pset->pending_deferred_AST_cpu_mask &
3575 ~pset->pending_AST_URGENT_cpu_mask);
3576 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3577 active_processor = processor_array[cpuid];
3578 /*
3579 * If a processor is DISPATCHING, it could be because of
3580 * a cancelable signal.
3581 *
3582 * IF the processor is not our
3583 * current processor (the current processor should not
3584 * be DISPATCHING, so this is a bit paranoid), AND there
3585 * is a cancelable signal pending on the processor, AND
3586 * there is no non-cancelable signal pending (as there is
3587 * no point trying to backtrack on bringing the processor
3588 * up if a signal we cannot cancel is outstanding), THEN
3589 * it should make sense to roll back the processor state
3590 * to the IDLE state.
3591 *
3592 * If the racey nature of this approach (as the signal
3593 * will be arbitrated by hardware, and can fire as we
3594 * roll back state) results in the core responding
3595 * despite being pushed back to the IDLE state, it
3596 * should be no different than if the core took some
3597 * interrupt while IDLE.
3598 */
3599 if (active_processor != processor) {
3600 /*
3601 * Squash all of the processor state back to some
3602 * reasonable facsimile of PROCESSOR_IDLE.
3603 */
3604
3605 processor_state_update_idle(active_processor);
3606 active_processor->deadline = RT_DEADLINE_NONE;
3607 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3608 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3609 machine_signal_idle_cancel(active_processor);
3610 }
3611 }
3612 }
3613
3614 pset_unlock(pset);
3615 }
3616 #else
3617 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3618 #endif
3619
3620 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3621 thread_csw_callout(
3622 thread_t old,
3623 thread_t new,
3624 uint64_t timestamp)
3625 {
3626 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3627 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3628 machine_switch_perfcontrol_context(event, timestamp, 0,
3629 same_pri_latency, old, new);
3630 }
3631
3632
3633 /*
3634 * thread_dispatch:
3635 *
3636 * Handle threads at context switch. Re-dispatch other thread
3637 * if still running, otherwise update run state and perform
3638 * special actions. Update quantum for other thread and begin
3639 * the quantum for ourselves.
3640 *
3641 * "thread" is the old thread that we have switched away from.
3642 * "self" is the new current thread that we have context switched to
3643 *
3644 * Called at splsched.
3645 *
3646 */
3647 void
thread_dispatch(thread_t thread,thread_t self)3648 thread_dispatch(
3649 thread_t thread,
3650 thread_t self)
3651 {
3652 processor_t processor = self->last_processor;
3653 bool was_idle = false;
3654 bool processor_bootstrap = (thread == THREAD_NULL);
3655
3656 assert(processor == current_processor());
3657 assert(self == current_thread_volatile());
3658 assert(thread != self);
3659
3660 if (thread != THREAD_NULL) {
3661 /*
3662 * Do the perfcontrol callout for context switch.
3663 * The reason we do this here is:
3664 * - thread_dispatch() is called from various places that are not
3665 * the direct context switch path for eg. processor shutdown etc.
3666 * So adding the callout here covers all those cases.
3667 * - We want this callout as early as possible to be close
3668 * to the timestamp taken in thread_invoke()
3669 * - We want to avoid holding the thread lock while doing the
3670 * callout
3671 * - We do not want to callout if "thread" is NULL.
3672 */
3673 thread_csw_callout(thread, self, processor->last_dispatch);
3674
3675 #if KASAN
3676 if (thread->continuation != NULL) {
3677 /*
3678 * Thread has a continuation and the normal stack is going away.
3679 * Unpoison the stack and mark all fakestack objects as unused.
3680 */
3681 #if KASAN_CLASSIC
3682 kasan_fakestack_drop(thread);
3683 #endif /* KASAN_CLASSIC */
3684 if (thread->kernel_stack) {
3685 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3686 }
3687 }
3688
3689
3690 #if KASAN_CLASSIC
3691 /*
3692 * Free all unused fakestack objects.
3693 */
3694 kasan_fakestack_gc(thread);
3695 #endif /* KASAN_CLASSIC */
3696 #endif /* KASAN */
3697
3698 /*
3699 * If blocked at a continuation, discard
3700 * the stack.
3701 */
3702 if (thread->continuation != NULL && thread->kernel_stack != 0) {
3703 stack_free(thread);
3704 }
3705
3706 if (thread->state & TH_IDLE) {
3707 was_idle = true;
3708 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3709 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3710 (uintptr_t)thread_tid(thread), 0, thread->state,
3711 sched_run_buckets[TH_BUCKET_RUN], 0);
3712 } else {
3713 int64_t consumed;
3714 int64_t remainder = 0;
3715
3716 if (processor->quantum_end > processor->last_dispatch) {
3717 remainder = processor->quantum_end -
3718 processor->last_dispatch;
3719 }
3720
3721 consumed = thread->quantum_remaining - remainder;
3722
3723 if ((thread->reason & AST_LEDGER) == 0) {
3724 /*
3725 * Bill CPU time to both the task and
3726 * the individual thread.
3727 */
3728 ledger_credit_thread(thread, thread->t_ledger,
3729 task_ledgers.cpu_time, consumed);
3730 ledger_credit_thread(thread, thread->t_threadledger,
3731 thread_ledgers.cpu_time, consumed);
3732 if (thread->t_bankledger) {
3733 ledger_credit_thread(thread, thread->t_bankledger,
3734 bank_ledgers.cpu_time,
3735 (consumed - thread->t_deduct_bank_ledger_time));
3736 }
3737 thread->t_deduct_bank_ledger_time = 0;
3738 if (consumed > 0) {
3739 /*
3740 * This should never be negative, but in traces we are seeing some instances
3741 * of consumed being negative.
3742 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3743 */
3744 sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
3745 }
3746 }
3747
3748 /* For the thread that we just context switched away from, figure
3749 * out if we have expired the wq quantum and set the AST if we have
3750 */
3751 if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3752 thread_evaluate_workqueue_quantum_expiry(thread);
3753 }
3754
3755 if (__improbable(thread->rwlock_count != 0)) {
3756 smr_mark_active_trackers_stalled(thread);
3757 }
3758
3759 /*
3760 * Pairs with task_restartable_ranges_synchronize
3761 */
3762 wake_lock(thread);
3763 thread_lock(thread);
3764
3765 /*
3766 * Same as ast_check(), in case we missed the IPI
3767 */
3768 thread_reset_pcs_ack_IPI(thread);
3769
3770 /*
3771 * Apply a priority floor if the thread holds a kernel resource
3772 * or explicitly requested it.
3773 * Do this before checking starting_pri to avoid overpenalizing
3774 * repeated rwlock blockers.
3775 */
3776 if (__improbable(thread->rwlock_count != 0)) {
3777 lck_rw_set_promotion_locked(thread);
3778 }
3779 if (__improbable(thread->priority_floor_count != 0)) {
3780 thread_floor_boost_set_promotion_locked(thread);
3781 }
3782
3783 boolean_t keep_quantum = processor->first_timeslice;
3784
3785 /*
3786 * Treat a thread which has dropped priority since it got on core
3787 * as having expired its quantum.
3788 */
3789 if (processor->starting_pri > thread->sched_pri) {
3790 keep_quantum = FALSE;
3791 }
3792
3793 /* Compute remainder of current quantum. */
3794 if (keep_quantum &&
3795 processor->quantum_end > processor->last_dispatch) {
3796 thread->quantum_remaining = (uint32_t)remainder;
3797 } else {
3798 thread->quantum_remaining = 0;
3799 }
3800
3801 if (thread->sched_mode == TH_MODE_REALTIME) {
3802 /*
3803 * Cancel the deadline if the thread has
3804 * consumed the entire quantum.
3805 */
3806 if (thread->quantum_remaining == 0) {
3807 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3808 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3809 thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3810 }
3811 } else {
3812 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3813 /*
3814 * For non-realtime threads treat a tiny
3815 * remaining quantum as an expired quantum
3816 * but include what's left next time.
3817 */
3818 if (thread->quantum_remaining < min_std_quantum) {
3819 thread->reason |= AST_QUANTUM;
3820 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3821 }
3822 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3823 }
3824
3825 /*
3826 * If we are doing a direct handoff then
3827 * take the remainder of the quantum.
3828 */
3829 if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3830 self->quantum_remaining = thread->quantum_remaining;
3831 thread->reason |= AST_QUANTUM;
3832 thread->quantum_remaining = 0;
3833 }
3834
3835 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3836
3837 if (!(thread->state & TH_WAIT)) {
3838 /*
3839 * Still runnable.
3840 */
3841 thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3842
3843 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);
3844
3845 ast_t reason = thread->reason;
3846 sched_options_t options = SCHED_NONE;
3847
3848 if (reason & AST_REBALANCE) {
3849 options |= SCHED_REBALANCE;
3850 if (reason & AST_QUANTUM) {
3851 /*
3852 * Having gone to the trouble of forcing this thread off a less preferred core,
3853 * we should force the preferable core to reschedule immediately to give this
3854 * thread a chance to run instead of just sitting on the run queue where
3855 * it may just be stolen back by the idle core we just forced it off.
3856 * But only do this at the end of a quantum to prevent cascading effects.
3857 */
3858 options |= SCHED_PREEMPT;
3859 }
3860 }
3861
3862 if (reason & AST_QUANTUM) {
3863 options |= SCHED_TAILQ;
3864 } else if (reason & AST_PREEMPT) {
3865 options |= SCHED_HEADQ;
3866 } else {
3867 options |= (SCHED_PREEMPT | SCHED_TAILQ);
3868 }
3869
3870 thread_setrun(thread, options);
3871
3872 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3873 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3874 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3875 sched_run_buckets[TH_BUCKET_RUN], 0);
3876
3877 if (thread->wake_active) {
3878 thread->wake_active = FALSE;
3879 thread_unlock(thread);
3880
3881 thread_wakeup(&thread->wake_active);
3882 } else {
3883 thread_unlock(thread);
3884 }
3885
3886 wake_unlock(thread);
3887 } else {
3888 /*
3889 * Waiting.
3890 */
3891 boolean_t should_terminate = FALSE;
3892 uint32_t new_run_count;
3893 int thread_state = thread->state;
3894
3895 /* Only the first call to thread_dispatch
3896 * after explicit termination should add
3897 * the thread to the termination queue
3898 */
3899 if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3900 should_terminate = TRUE;
3901 thread_state |= TH_TERMINATE2;
3902 }
3903
3904 timer_stop(&thread->runnable_timer, processor->last_dispatch);
3905
3906 thread_state &= ~TH_RUN;
3907 thread->state = thread_state;
3908
3909 thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3910 thread->chosen_processor = PROCESSOR_NULL;
3911
3912 new_run_count = SCHED(run_count_decr)(thread);
3913
3914 #if CONFIG_SCHED_AUTO_JOIN
3915 if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3916 work_interval_auto_join_unwind(thread);
3917 }
3918 #endif /* CONFIG_SCHED_AUTO_JOIN */
3919
3920 #if CONFIG_SCHED_SFI
3921 if (thread->reason & AST_SFI) {
3922 thread->wait_sfi_begin_time = processor->last_dispatch;
3923 }
3924 #endif
3925 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);
3926
3927 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3928 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3929 (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3930 new_run_count, 0);
3931
3932 if (thread_state & TH_WAIT_REPORT) {
3933 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3934 }
3935
3936 if (thread->wake_active) {
3937 thread->wake_active = FALSE;
3938 thread_unlock(thread);
3939
3940 thread_wakeup(&thread->wake_active);
3941 } else {
3942 thread_unlock(thread);
3943 }
3944
3945 wake_unlock(thread);
3946
3947 if (should_terminate) {
3948 thread_terminate_enqueue(thread);
3949 }
3950 }
3951 }
3952 /*
3953 * The thread could have been added to the termination queue, so it's
3954 * unsafe to use after this point.
3955 */
3956 thread = THREAD_NULL;
3957 }
3958
3959 int urgency = THREAD_URGENCY_NONE;
3960 uint64_t latency = 0;
3961
3962 /* Update (new) current thread and reprogram running timers */
3963 thread_lock(self);
3964
3965 if (!(self->state & TH_IDLE)) {
3966 uint64_t arg1, arg2;
3967
3968 #if CONFIG_SCHED_SFI
3969 ast_t new_ast;
3970
3971 new_ast = sfi_thread_needs_ast(self, NULL);
3972
3973 if (new_ast != AST_NONE) {
3974 ast_on(new_ast);
3975 }
3976 #endif
3977
3978 if (processor->last_dispatch < self->last_made_runnable_time) {
3979 panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3980 processor->last_dispatch, self->last_made_runnable_time);
3981 }
3982
3983 assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3984
3985 latency = processor->last_dispatch - self->last_made_runnable_time;
3986 assert(latency >= self->same_pri_latency);
3987
3988 urgency = thread_get_urgency(self, &arg1, &arg2);
3989
3990 thread_tell_urgency(urgency, arg1, arg2, latency, self);
3991
3992 /*
3993 * Start a new CPU limit interval if the previous one has
3994 * expired. This should happen before initializing a new
3995 * quantum.
3996 */
3997 if (cpulimit_affects_quantum &&
3998 thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
3999 thread_cpulimit_restart(processor->last_dispatch);
4000 }
4001
4002 /*
4003 * Get a new quantum if none remaining.
4004 */
4005 if (self->quantum_remaining == 0) {
4006 thread_quantum_init(self, processor->last_dispatch);
4007 }
4008
4009 /*
4010 * Set up quantum timer and timeslice.
4011 */
4012 processor->quantum_end = processor->last_dispatch +
4013 self->quantum_remaining;
4014
4015 running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
4016 processor->quantum_end, processor->last_dispatch);
4017 if (was_idle) {
4018 /*
4019 * kperf's running timer is active whenever the idle thread for a
4020 * CPU is not running.
4021 */
4022 kperf_running_setup(processor, processor->last_dispatch);
4023 }
4024 running_timers_activate(processor);
4025 processor->first_timeslice = TRUE;
4026 } else {
4027 if (!processor_bootstrap) {
4028 running_timers_deactivate(processor);
4029 }
4030 processor->first_timeslice = FALSE;
4031 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
4032 }
4033
4034 assert(self->block_hint == kThreadWaitNone);
4035 self->computation_epoch = processor->last_dispatch;
4036 /*
4037 * This relies on the interrupt time being tallied up to the thread in the
4038 * exception handler epilogue, which is before AST context where preemption
4039 * is considered (and the scheduler is potentially invoked to
4040 * context switch, here).
4041 */
4042 self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
4043 self->reason = AST_NONE;
4044 processor->starting_pri = self->sched_pri;
4045
4046 thread_unlock(self);
4047
4048 machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
4049 processor->last_dispatch);
4050
4051 #if defined(CONFIG_SCHED_DEFERRED_AST)
4052 /*
4053 * TODO: Can we state that redispatching our old thread is also
4054 * uninteresting?
4055 */
4056 if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
4057 pset_cancel_deferred_dispatch(processor->processor_set, processor);
4058 }
4059 #endif
4060 }
4061
4062 /*
4063 * thread_block_reason:
4064 *
4065 * Forces a reschedule, blocking the caller if a wait
4066 * has been asserted.
4067 *
4068 * If a continuation is specified, then thread_invoke will
4069 * attempt to discard the thread's kernel stack. When the
4070 * thread resumes, it will execute the continuation function
4071 * on a new kernel stack.
4072 */
wait_result_t
thread_block_reason(
	thread_continue_t       continuation,
	void                    *parameter,
	ast_t                   reason)
{
	thread_t        self = current_thread();
	processor_t     processor;
	thread_t        new_thread;
	spl_t           s;

	s = splsched();

	processor = current_processor();

	/* If we're explicitly yielding, force a subsequent quantum */
	if (reason & AST_YIELD) {
		processor->first_timeslice = FALSE;
	}

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	clear_pending_nonurgent_preemption(processor);

#if PROC_REF_DEBUG
	/*
	 * Blocking with a continuation discards the kernel stack, so a
	 * non-kernel-task thread must not be holding proc refs here or they
	 * would be leaked.
	 */
	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
	}
#endif

#if CONFIG_EXCLAVES
	/* Continuations must not be used while in any exclaves state. */
	if (continuation != NULL) {
		assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0);
	}
#endif /* CONFIG_EXCLAVES */

	self->continuation = continuation;
	self->parameter = parameter;

	/* Trace only when there is state beyond simply running/idle. */
	if (self->state & ~(TH_RUN | TH_IDLE)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	}

	/*
	 * Select the next thread and attempt the switch; thread_invoke() may
	 * fail, in which case we re-select and retry.
	 */
	do {
		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	} while (!thread_invoke(self, new_thread, reason));

	splx(s);

	return self->wait_result;
}
4129
4130 /*
4131 * thread_block:
4132 *
4133 * Block the current thread if a wait has been asserted.
4134 */
wait_result_t
thread_block(
	thread_continue_t       continuation)
{
	/* Convenience wrapper: no continuation parameter, no explicit AST reason. */
	return thread_block_reason(continuation, NULL, AST_NONE);
}
4141
wait_result_t
thread_block_parameter(
	thread_continue_t       continuation,
	void                    *parameter)
{
	/* Like thread_block(), but passes a parameter through to the continuation. */
	return thread_block_reason(continuation, parameter, AST_NONE);
}
4149
4150 /*
4151 * thread_run:
4152 *
4153 * Switch directly from the current thread to the
4154 * new thread, handing off our quantum if appropriate.
4155 *
4156 * New thread must be runnable, and not on a run queue.
4157 *
4158 * Called at splsched.
4159 */
int
thread_run(
	thread_t                self,
	thread_continue_t       continuation,
	void                    *parameter,
	thread_t                new_thread)
{
	ast_t reason = AST_NONE;

	/* An idle thread has no quantum to hand off. */
	if ((self->state & TH_IDLE) == 0) {
		reason = AST_HANDOFF;
	}

	/* Must not get here without a chosen processor */
	assert(new_thread->chosen_processor);

	self->continuation = continuation;
	self->parameter = parameter;

	while (!thread_invoke(self, new_thread, reason)) {
		/* the handoff failed, so we have to fall back to the normal block path */
		processor_t processor = current_processor();

		reason = AST_NONE;

		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	}

	return self->wait_result;
}
4192
4193 /*
4194 * thread_continue:
4195 *
4196 * Called at splsched when a thread first receives
4197 * a new stack after a continuation.
4198 *
4199 * Called with THREAD_NULL as the old thread when
4200 * invoked by machine_load_context.
4201 */
void
thread_continue(
	thread_t        thread)
{
	thread_t self = current_thread();
	thread_continue_t continuation;
	void *parameter;

	DTRACE_SCHED(on__cpu);

	/* Stash the continuation and parameter before they are cleared below. */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif


	/* Finish the switch away from the previous thread ('thread' may be NULL). */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if KASAN_TBI
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN_TBI */


	/* Transfers control to the continuation; never returns here. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
4248
4249 void
thread_quantum_init(thread_t thread,uint64_t now)4250 thread_quantum_init(thread_t thread, uint64_t now)
4251 {
4252 uint64_t new_quantum = 0;
4253
4254 switch (thread->sched_mode) {
4255 case TH_MODE_REALTIME:
4256 new_quantum = thread->realtime.computation;
4257 new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4258 break;
4259
4260 case TH_MODE_FIXED:
4261 new_quantum = SCHED(initial_quantum_size)(thread);
4262 new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4263 break;
4264
4265 default:
4266 new_quantum = SCHED(initial_quantum_size)(thread);
4267 break;
4268 }
4269
4270 if (cpulimit_affects_quantum) {
4271 const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4272
4273 /*
4274 * If there's no remaining CPU time, the ledger system will
4275 * notice and put the thread to sleep.
4276 */
4277 if (cpulimit_remaining > 0) {
4278 new_quantum = MIN(new_quantum, cpulimit_remaining);
4279 }
4280 }
4281
4282 assert3u(new_quantum, <, UINT32_MAX);
4283 assert3u(new_quantum, >, 0);
4284
4285 thread->quantum_remaining = (uint32_t)new_quantum;
4286 }
4287
4288 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)4289 sched_timeshare_initial_quantum_size(thread_t thread)
4290 {
4291 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4292 return bg_quantum;
4293 } else {
4294 return std_quantum;
4295 }
4296 }
4297
4298 /*
4299 * run_queue_init:
4300 *
4301 * Initialize a run queue before first use.
4302 */
4303 void
run_queue_init(run_queue_t rq)4304 run_queue_init(
4305 run_queue_t rq)
4306 {
4307 rq->highq = NOPRI;
4308 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4309 rq->bitmap[i] = 0;
4310 }
4311 rq->urgency = rq->count = 0;
4312 for (int i = 0; i < NRQS; i++) {
4313 circle_queue_init(&rq->queues[i]);
4314 }
4315 }
4316
4317 /*
4318 * run_queue_dequeue:
4319 *
4320 * Perform a dequeue operation on a run queue,
4321 * and return the resulting thread.
4322 *
4323 * The run queue must be locked (see thread_run_queue_remove()
4324 * for more info), and not empty.
4325 */
4326 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)4327 run_queue_dequeue(
4328 run_queue_t rq,
4329 sched_options_t options)
4330 {
4331 thread_t thread;
4332 circle_queue_t queue = &rq->queues[rq->highq];
4333
4334 if (options & SCHED_HEADQ) {
4335 thread = cqe_dequeue_head(queue, struct thread, runq_links);
4336 } else {
4337 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4338 }
4339
4340 assert(thread != THREAD_NULL);
4341 assert_thread_magic(thread);
4342
4343 thread_clear_runq(thread);
4344 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4345 rq->count--;
4346 if (SCHED(priority_is_urgent)(rq->highq)) {
4347 rq->urgency--; assert(rq->urgency >= 0);
4348 }
4349 if (circle_queue_empty(queue)) {
4350 bitmap_clear(rq->bitmap, rq->highq);
4351 rq->highq = bitmap_first(rq->bitmap, NRQS);
4352 }
4353
4354 return thread;
4355 }
4356
4357 /*
4358 * run_queue_enqueue:
4359 *
4360 * Perform a enqueue operation on a run queue.
4361 *
4362 * The run queue must be locked (see thread_run_queue_remove()
4363 * for more info).
4364 */
4365 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4366 run_queue_enqueue(
4367 run_queue_t rq,
4368 thread_t thread,
4369 sched_options_t options)
4370 {
4371 circle_queue_t queue = &rq->queues[thread->sched_pri];
4372 boolean_t result = FALSE;
4373
4374 assert_thread_magic(thread);
4375
4376 if (circle_queue_empty(queue)) {
4377 circle_enqueue_tail(queue, &thread->runq_links);
4378
4379 rq_bitmap_set(rq->bitmap, thread->sched_pri);
4380 if (thread->sched_pri > rq->highq) {
4381 rq->highq = thread->sched_pri;
4382 result = TRUE;
4383 }
4384 } else {
4385 if (options & SCHED_TAILQ) {
4386 circle_enqueue_tail(queue, &thread->runq_links);
4387 } else {
4388 circle_enqueue_head(queue, &thread->runq_links);
4389 }
4390 }
4391 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4392 rq->urgency++;
4393 }
4394 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4395 rq->count++;
4396
4397 return result;
4398 }
4399
4400 /*
4401 * run_queue_remove:
4402 *
4403 * Remove a specific thread from a runqueue.
4404 *
4405 * The run queue must be locked.
4406 */
4407 void
run_queue_remove(run_queue_t rq,thread_t thread)4408 run_queue_remove(
4409 run_queue_t rq,
4410 thread_t thread)
4411 {
4412 circle_queue_t queue = &rq->queues[thread->sched_pri];
4413
4414 thread_assert_runq_nonnull(thread);
4415 assert_thread_magic(thread);
4416
4417 circle_dequeue(queue, &thread->runq_links);
4418 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4419 rq->count--;
4420 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4421 rq->urgency--; assert(rq->urgency >= 0);
4422 }
4423
4424 if (circle_queue_empty(queue)) {
4425 /* update run queue status */
4426 bitmap_clear(rq->bitmap, thread->sched_pri);
4427 rq->highq = bitmap_first(rq->bitmap, NRQS);
4428 }
4429
4430 thread_clear_runq(thread);
4431 }
4432
4433 /*
4434 * run_queue_peek
4435 *
4436 * Peek at the runq and return the highest
4437 * priority thread from the runq.
4438 *
4439 * The run queue must be locked.
4440 */
4441 thread_t
run_queue_peek(run_queue_t rq)4442 run_queue_peek(
4443 run_queue_t rq)
4444 {
4445 if (rq->count > 0) {
4446 circle_queue_t queue = &rq->queues[rq->highq];
4447 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4448 assert_thread_magic(thread);
4449 return thread;
4450 } else {
4451 return THREAD_NULL;
4452 }
4453 }
4454
/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into its per-priority band, kept in
 * earliest-deadline-first order.  Returns true when the thread became
 * the head of its band (new earliest deadline at that priority).
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	/* Mark this priority band occupied. */
	bitmap_set(map, i);

	queue_t queue = &rt_runq->pri_queue;
	uint64_t deadline = thread->realtime.deadline;
	bool preempt = false;
	bool earliest = false;

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				/* Becoming head means a new earliest deadline for this band. */
				if (iter == queue_first(queue)) {
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Publish a new queue-wide earliest deadline if this one beats it. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	thread_set_runq_locked(thread, processor);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4515
/*
 * rt_runq_dequeue:
 *
 * Remove and return the next realtime thread: normally the head of the
 * highest-priority occupied band; in non-strict mode, the overall
 * earliest-deadline thread may be chosen instead when running it still
 * leaves the higher-priority head enough time to meet its constraint.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			/*
			 * Only deviate from strict priority when both computations
			 * (plus epsilon) fit within the higher-priority thread's
			 * constraint window.
			 */
			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Band still occupied: its earliest deadline is the new head's. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across occupied bands. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq(new_thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4579
4580 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4581 rt_runq_first(rt_queue_t rt_run_queue)
4582 {
4583 bitmap_t *map = rt_run_queue->bitmap;
4584 int i = bitmap_first(map, NRTQS);
4585 if (i < 0) {
4586 return THREAD_NULL;
4587 }
4588 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4589 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4590
4591 return next_rt;
4592 }
4593
/*
 * rt_runq_remove:
 *
 * Remove a specific realtime thread from its priority band and
 * recompute the per-band and queue-wide earliest deadlines.
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Band still occupied: its earliest deadline is the new head's. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across occupied bands. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq_locked(thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4639
/* Return the pset-local realtime run queue. */
rt_queue_t
sched_rtlocal_runq(processor_set_t pset)
{
	return &pset->rt_runq;
}
4645
/* Initialize the pset's realtime run queue state. */
void
sched_rtlocal_init(processor_set_t pset)
{
	pset_rt_init(pset);
}
4651
/*
 * sched_rtlocal_queue_shutdown:
 *
 * On processor shutdown, drain the pset's realtime run queue (if no
 * other processor in the pset remains available) and re-dispatch the
 * drained threads.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t        thread;
	queue_head_t    tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Collect all queued RT threads onto a local list under the pset lock. */
	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	/* Re-dispatch each thread with the pset lock dropped. */
	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4687
4688 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t        thread;

	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Walk every occupied RT priority band in this pset. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				/* Track the oldest make-runnable timestamp seen so far. */
				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4720
4721 int64_t
sched_rtlocal_runq_count_sum(void)4722 sched_rtlocal_runq_count_sum(void)
4723 {
4724 pset_node_t node = &pset_node0;
4725 processor_set_t pset = node->psets;
4726 int64_t count = 0;
4727
4728 do {
4729 while (pset != NULL) {
4730 count += pset->rt_runq.runq_stats.count_sum;
4731
4732 pset = pset->pset_list;
4733 }
4734 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4735
4736 return count;
4737 }
4738
4739 /*
4740 * Called with stealing_pset locked and
4741 * returns with stealing_pset locked
4742 * but the lock will have been dropped
4743 * if a thread is returned.
4744 */
thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	if (!sched_allow_rt_steal) {
		return THREAD_NULL;
	}
	pset_map_t pset_map = stealing_pset->node->pset_map;

	/* Never consider stealing from ourselves. */
	bit_clear(pset_map, stealing_pset->pset_id);

	processor_set_t pset = stealing_pset;

	processor_set_t target_pset;
	uint64_t target_deadline;

retry:
	target_pset = NULL;
	/* Only steal when the victim's deadline beats ours by more than epsilon. */
	target_deadline = earliest_deadline - rt_deadline_epsilon;

	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		/*
		 * During startup, while pset_array[] and node->pset_map are still being initialized,
		 * the update to pset_map may become visible to this cpu before the update to pset_array[].
		 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
		 * so just check nset is not NULL instead.
		 */
		if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
			target_deadline = nset->stealable_rt_threads_earliest_deadline;
			target_pset = nset;
		}
	}

	if (target_pset != NULL) {
		/* Re-validate under the target's lock; the unlocked scan may be stale. */
		pset = change_locked_pset(pset, target_pset);
		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
			pset_update_rt_stealable_state(pset);
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

			pset = change_locked_pset(pset, stealing_pset);
			return new_thread;
		}
		/* Lost the race: refresh our own earliest deadline and rescan. */
		pset = change_locked_pset(pset, stealing_pset);
		earliest_deadline = rt_runq_earliest_deadline(pset);
		goto retry;
	}

	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
4797
4798 /*
4799 * pset is locked
4800 */
thread_t
sched_rt_choose_thread(processor_set_t pset)
{
	processor_t processor = current_processor();

	if (SCHED(steal_thread_enabled)(pset)) {
		do {
			/* Acknowledge any spill signal aimed at this cpu before stealing. */
			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (spill_pending) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
			}
			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
			if (new_thread != THREAD_NULL) {
				/* Clear (and trace) any spill that arrived while stealing. */
				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
				}
				return new_thread;
			}
			/* Retry if a new spill was posted for this cpu during the attempt. */
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}

	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
	}

	/* Nothing stolen: fall back to this pset's own RT run queue. */
	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}
4835
4836 /*
4837 * realtime_queue_insert:
4838 *
4839 * Enqueue a thread for realtime execution.
4840 */
4841 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4842 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4843 {
4844 pset_assert_locked(pset);
4845
4846 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4847 pset_update_rt_stealable_state(pset);
4848
4849 return preempt;
4850 }
4851
4852 /*
4853 * realtime_setrun:
4854 *
4855 * Dispatch a thread for realtime execution.
4856 *
4857 * Thread must be locked. Associated pset must
4858 * be locked, and is returned unlocked.
4859 */
static void
realtime_setrun(
	processor_t                    chosen_processor,
	thread_t                       thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	bool pset_is_locked = true;

	/*
	 * Number of extra processors to signal beyond the chosen one, so a
	 * backup can pick the thread up if the chosen CPU is slow to respond.
	 * Only threads with sufficiently tight constraints get backups.
	 */
	int n_backup = 0;

	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/*
	 * CPUs that already have urgent ASTs pending in excess of the queued RT
	 * threads are effectively acting as backups already; don't over-signal.
	 */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	/* Slot 0 is the chosen processor; slots 1..n_backup are backups. */
	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			/* Decide whether the chosen processor must preempt its current thread. */
			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				/* Equal priority: preempt only for a strictly earlier deadline (epsilon slack). */
				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* Local idle CPU: set the AST directly, no IPI needed. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						/* Remote idle CPU: signal it via the IPI machinery. */
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking up; just ensure the urgent AST is recorded. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					/* Running/other state. */
					if (processor == current_processor()) {
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slots: pick additional RT-capable processors to IPI. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			/* choose_next_rt_processor_for_IPI may drop the pset lock; track that. */
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/*
			 * NOTE(review): `backup` is not declared anywhere visible in this
			 * function; this trace argument looks like it should be `i` (the
			 * backup slot index) — confirm against the tracepoint consumer.
			 */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if CONFIG_SCHED_SMT
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif /* CONFIG_SCHED_SMT */
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Deliver all IPIs after dropping the pset lock. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4999
5000
5001 sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset,processor_t dst,thread_t thread,__unused sched_ipi_event_t event)5002 sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
5003 thread_t thread, __unused sched_ipi_event_t event)
5004 {
5005 #if defined(CONFIG_SCHED_DEFERRED_AST)
5006 #if CONFIG_THREAD_GROUPS
5007 if (thread) {
5008 struct thread_group *tg = thread_group_get(thread);
5009 if (thread_group_uses_immediate_ipi(tg)) {
5010 return SCHED_IPI_IMMEDIATE;
5011 }
5012 }
5013 #endif /* CONFIG_THREAD_GROUPS */
5014 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
5015 return SCHED_IPI_DEFERRED;
5016 }
5017 #else /* CONFIG_SCHED_DEFERRED_AST */
5018 (void) thread;
5019 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
5020 #endif /* CONFIG_SCHED_DEFERRED_AST */
5021 return SCHED_IPI_NONE;
5022 }
5023
5024 sched_ipi_type_t
sched_ipi_action(processor_t dst,thread_t thread,sched_ipi_event_t event)5025 sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
5026 {
5027 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
5028 assert(dst != NULL);
5029
5030 processor_set_t pset = dst->processor_set;
5031 if (current_processor() == dst) {
5032 return SCHED_IPI_NONE;
5033 }
5034
5035 bool dst_idle = (dst->state == PROCESSOR_IDLE);
5036 if (dst_idle) {
5037 pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
5038 }
5039
5040 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
5041 switch (ipi_type) {
5042 case SCHED_IPI_NONE:
5043 return SCHED_IPI_NONE;
5044 #if defined(CONFIG_SCHED_DEFERRED_AST)
5045 case SCHED_IPI_DEFERRED:
5046 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
5047 break;
5048 #endif /* CONFIG_SCHED_DEFERRED_AST */
5049 default:
5050 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
5051 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5052 dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
5053 }
5054 bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
5055 break;
5056 }
5057 return ipi_type;
5058 }
5059
5060 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)5061 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5062 {
5063 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
5064 boolean_t deferred_ipi_supported = false;
5065 processor_set_t pset = dst->processor_set;
5066
5067 #if defined(CONFIG_SCHED_DEFERRED_AST)
5068 deferred_ipi_supported = true;
5069 #endif /* CONFIG_SCHED_DEFERRED_AST */
5070
5071 switch (event) {
5072 case SCHED_IPI_EVENT_SPILL:
5073 case SCHED_IPI_EVENT_SMT_REBAL:
5074 case SCHED_IPI_EVENT_REBALANCE:
5075 case SCHED_IPI_EVENT_BOUND_THR:
5076 case SCHED_IPI_EVENT_RT_PREEMPT:
5077 /*
5078 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
5079 * scenarios use immediate IPIs always.
5080 */
5081 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5082 break;
5083 case SCHED_IPI_EVENT_PREEMPT:
5084 /* In the preemption case, use immediate IPIs for RT threads */
5085 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
5086 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5087 break;
5088 }
5089
5090 /*
5091 * For Non-RT threads preemption,
5092 * If the core is active, use immediate IPIs.
5093 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
5094 */
5095 if (deferred_ipi_supported && dst_idle) {
5096 return sched_ipi_deferred_policy(pset, dst, thread, event);
5097 }
5098 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5099 break;
5100 default:
5101 panic("Unrecognized scheduler IPI event type %d", event);
5102 }
5103 assert(ipi_type != SCHED_IPI_NONE);
5104 return ipi_type;
5105 }
5106
5107 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)5108 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
5109 {
5110 switch (ipi) {
5111 case SCHED_IPI_NONE:
5112 break;
5113 case SCHED_IPI_IDLE:
5114 machine_signal_idle(dst);
5115 break;
5116 case SCHED_IPI_IMMEDIATE:
5117 cause_ast_check(dst);
5118 break;
5119 case SCHED_IPI_DEFERRED:
5120 machine_signal_idle_deferred(dst);
5121 break;
5122 default:
5123 panic("Unrecognized scheduler IPI type: %d", ipi);
5124 }
5125 }
5126
5127 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
5128
5129 boolean_t
priority_is_urgent(int priority)5130 priority_is_urgent(int priority)
5131 {
5132 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
5133 }
5134
5135 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
5136
5137 /*
5138 * processor_setrun:
5139 *
5140 * Dispatch a thread for execution on a
5141 * processor.
5142 *
5143 * Thread must be locked. Associated pset must
5144 * be locked, and is returned unlocked.
5145 */
static void
processor_setrun(
	processor_t                    processor,
	thread_t                       thread,
	integer_t                      options)
{
	processor_set_t pset = processor->processor_set;
	pset_assert_locked(pset);
	ast_t preempt = AST_NONE;
	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	thread->chosen_processor = processor;

	/*
	 * Set preemption mode.
	 */
#if defined(CONFIG_SCHED_DEFERRED_AST)
	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
#endif
	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
		/* Urgent priority strictly above the running thread: preempt immediately. */
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if (processor->current_is_eagerpreempt) {
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		/* Decayed timeshare thread: consult its base priority for urgency. */
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else {
		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	}

	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
		/*
		 * Having gone to the trouble of forcing this thread off a less preferred core,
		 * we should force the preferable core to reschedule immediately to give this
		 * thread a chance to run instead of just sitting on the run queue where
		 * it may just be stolen back by the idle core we just forced it off.
		 */
		preempt |= AST_PREEMPT;
	}

	SCHED(processor_enqueue)(processor, thread, options);
	sched_update_pset_load_average(pset, 0);

	/* Choose how to poke the target processor based on its state. */
	if (preempt != AST_NONE) {
		if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			/* Already waking: just record the urgent AST. */
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
			}
		} else if (processor->state == PROCESSOR_RUNNING &&
		    (thread->sched_pri >= processor->current_pri)) {
			ipi_action = eInterruptRunning;
		}
	} else {
		/*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling
		 */
		if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
			}
		}
	}

	if (ipi_action != eDoNothing) {
		if (processor == current_processor()) {
			/* Target is ourselves: set ASTs directly instead of sending an IPI. */
			if (ipi_action == eExitIdle) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
			}
			/* Re-derive the AST from the full check now that the thread is enqueued. */
			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}

			/* Keep the pending-AST masks in sync with the AST we actually raised. */
			if ((preempt & AST_URGENT) == AST_URGENT) {
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
				}
			} else {
				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
				}
			}

			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			} else {
				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			}
		} else {
			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
			ipi_type = sched_ipi_action(processor, thread, event);
		}
	}

	/* Drop the pset lock before delivering any IPI. */
	pset_unlock(pset);
	sched_ipi_perform(processor, ipi_type);

	if (ipi_action != eDoNothing && processor == current_processor()) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}
5260
5261 /*
5262 * choose_next_pset:
5263 *
5264 * Return the next sibling pset containing
5265 * available processors.
5266 *
5267 * Returns the original pset if none other is
5268 * suitable.
5269 */
5270 static processor_set_t
choose_next_pset(processor_set_t pset)5271 choose_next_pset(
5272 processor_set_t pset)
5273 {
5274 processor_set_t nset = pset;
5275
5276 do {
5277 nset = next_pset(nset);
5278
5279 /*
5280 * Sometimes during startup the pset_map can contain a bit
5281 * for a pset that isn't fully published in pset_array because
5282 * the pset_map read isn't an acquire load.
5283 *
5284 * In order to avoid needing an acquire barrier here, just bail
5285 * out.
5286 */
5287 if (nset == PROCESSOR_SET_NULL) {
5288 return pset;
5289 }
5290 } while (nset->online_processor_count < 1 && nset != pset);
5291
5292 return nset;
5293 }
5294
5295 #if CONFIG_SCHED_SMT
5296 /*
5297 * choose_processor_smt:
5298 *
5299 * SMT-aware implementation of choose_processor.
5300 */
5301 processor_t
choose_processor_smt(processor_set_t starting_pset,processor_t processor,thread_t thread)5302 choose_processor_smt(
5303 processor_set_t starting_pset,
5304 processor_t processor,
5305 thread_t thread)
5306 {
5307 processor_set_t pset = starting_pset;
5308 processor_set_t nset;
5309
5310 assert(thread->sched_pri <= MAXPRI);
5311
5312 /*
5313 * Prefer the hinted processor, when appropriate.
5314 */
5315
5316 /* Fold last processor hint from secondary processor to its primary */
5317 if (processor != PROCESSOR_NULL) {
5318 processor = processor->processor_primary;
5319 }
5320
5321 /*
5322 * Only consult platform layer if pset is active, which
5323 * it may not be in some cases when a multi-set system
5324 * is going to sleep.
5325 */
5326 if (pset->online_processor_count) {
5327 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
5328 processor_t mc_processor = machine_choose_processor(pset, processor);
5329 if (mc_processor != PROCESSOR_NULL) {
5330 processor = mc_processor->processor_primary;
5331 }
5332 }
5333 }
5334
5335 /*
5336 * At this point, we may have a processor hint, and we may have
5337 * an initial starting pset. If the hint is not in the pset, or
5338 * if the hint is for a processor in an invalid state, discard
5339 * the hint.
5340 */
5341 if (processor != PROCESSOR_NULL) {
5342 if (processor->processor_set != pset) {
5343 processor = PROCESSOR_NULL;
5344 } else if (!processor->is_recommended) {
5345 processor = PROCESSOR_NULL;
5346 } else {
5347 switch (processor->state) {
5348 case PROCESSOR_START:
5349 case PROCESSOR_PENDING_OFFLINE:
5350 case PROCESSOR_OFF_LINE:
5351 /*
5352 * Hint is for a processor that cannot support running new threads.
5353 */
5354 processor = PROCESSOR_NULL;
5355 break;
5356 case PROCESSOR_IDLE:
5357 /*
5358 * Hint is for an idle processor. Assume it is no worse than any other
5359 * idle processor. The platform layer had an opportunity to provide
5360 * the "least cost idle" processor above.
5361 */
5362 if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5363 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5364 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5365 /*
5366 * If the rotation bitmask to force a migration is set for this core and there's an idle core that
5367 * that needn't be avoided, don't continue running on the same core.
5368 */
5369 if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
5370 return processor;
5371 }
5372 }
5373 processor = PROCESSOR_NULL;
5374 break;
5375 case PROCESSOR_RUNNING:
5376 case PROCESSOR_DISPATCHING:
5377 /*
5378 * Hint is for an active CPU. This fast-path allows
5379 * realtime threads to preempt non-realtime threads
5380 * to regain their previous executing processor.
5381 */
5382 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5383 if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5384 return processor;
5385 }
5386 processor = PROCESSOR_NULL;
5387 }
5388
5389 /* Otherwise, use hint as part of search below */
5390 break;
5391 default:
5392 processor = PROCESSOR_NULL;
5393 break;
5394 }
5395 }
5396 }
5397
5398 /*
5399 * Iterate through the processor sets to locate
5400 * an appropriate processor. Seed results with
5401 * a last-processor hint, if available, so that
5402 * a search must find something strictly better
5403 * to replace it.
5404 *
5405 * A primary/secondary pair of SMT processors are
5406 * "unpaired" if the primary is busy but its
5407 * corresponding secondary is idle (so the physical
5408 * core has full use of its resources).
5409 */
5410
5411 integer_t lowest_priority = MAXPRI + 1;
5412 integer_t lowest_secondary_priority = MAXPRI + 1;
5413 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
5414 integer_t lowest_idle_secondary_priority = MAXPRI + 1;
5415 integer_t lowest_count = INT_MAX;
5416 processor_t lp_processor = PROCESSOR_NULL;
5417 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
5418 processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
5419 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
5420 processor_t lc_processor = PROCESSOR_NULL;
5421
5422 if (processor != PROCESSOR_NULL) {
5423 /* All other states should be enumerated above. */
5424 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
5425 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5426
5427 lowest_priority = processor->current_pri;
5428 lp_processor = processor;
5429
5430 lowest_count = SCHED(processor_runq_count)(processor);
5431 lc_processor = processor;
5432 }
5433
5434 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5435 pset_node_t node = pset->node;
5436 bool include_ast_urgent_pending_cpus = false;
5437 cpumap_t ast_urgent_pending;
5438 try_again:
5439 ast_urgent_pending = 0;
5440 int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
5441 for (; consider_secondaries < 2; consider_secondaries++) {
5442 pset = change_locked_pset(pset, starting_pset);
5443 do {
5444 cpumap_t available_map = pset_available_cpumap(pset);
5445 if (available_map == 0) {
5446 goto no_available_cpus;
5447 }
5448
5449 processor = choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false);
5450 if (processor) {
5451 return processor;
5452 }
5453
5454 if (consider_secondaries) {
5455 processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5456 if (processor) {
5457 /*
5458 * Instead of looping through all the psets to find the global
5459 * furthest deadline processor, preempt the first candidate found.
5460 * The preempted thread will then find any other available far deadline
5461 * processors to preempt.
5462 */
5463 return processor;
5464 }
5465
5466 ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5467
5468 if (rt_runq_count(pset) < lowest_count) {
5469 int cpuid = bit_first(available_map);
5470 assert(cpuid >= 0);
5471 lc_processor = processor_array[cpuid];
5472 lowest_count = rt_runq_count(pset);
5473 }
5474 }
5475
5476 no_available_cpus:
5477 nset = next_pset(pset);
5478
5479 if (nset != starting_pset) {
5480 pset = change_locked_pset(pset, nset);
5481 }
5482 } while (nset != starting_pset);
5483 }
5484
5485 /* Short cut for single pset nodes */
5486 if (bit_count(node->pset_map) == 1) {
5487 if (lc_processor) {
5488 pset_assert_locked(lc_processor->processor_set);
5489 return lc_processor;
5490 }
5491 } else {
5492 if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5493 /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5494 include_ast_urgent_pending_cpus = true;
5495 goto try_again;
5496 }
5497 }
5498
5499 processor = lc_processor;
5500
5501 if (processor) {
5502 pset = change_locked_pset(pset, processor->processor_set);
5503 /* Check that chosen processor is still usable */
5504 cpumap_t available_map = pset_available_cpumap(pset);
5505 if (bit_test(available_map, processor->cpu_id)) {
5506 return processor;
5507 }
5508
5509 /* processor is no longer usable */
5510 processor = PROCESSOR_NULL;
5511 }
5512
5513 pset_assert_locked(pset);
5514 pset_unlock(pset);
5515 return PROCESSOR_NULL;
5516 }
5517
5518 /* No realtime threads from this point on */
5519 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5520
5521 do {
5522 /*
5523 * Choose an idle processor, in pset traversal order
5524 */
5525 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5526 uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
5527
5528 /* there shouldn't be a pending AST if the processor is idle */
5529 assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5530
5531 /*
5532 * Look at the preferred cores first.
5533 */
5534 int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
5535 if (cpuid < 0) {
5536 cpuid = lsb_first(preferred_idle_primary_map);
5537 }
5538 if (cpuid >= 0) {
5539 processor = processor_array[cpuid];
5540 pset->cpu_preferred_last_chosen = cpuid;
5541 return processor;
5542 }
5543
5544 /*
5545 * Look at the cores that don't need to be avoided next.
5546 */
5547 if (pset->perfcontrol_cpu_migration_bitmask != 0) {
5548 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5549 cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
5550 if (cpuid < 0) {
5551 cpuid = lsb_first(non_avoided_idle_primary_map);
5552 }
5553 if (cpuid >= 0) {
5554 processor = processor_array[cpuid];
5555 pset->cpu_preferred_last_chosen = cpuid;
5556 return processor;
5557 }
5558 }
5559
5560 /*
5561 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
5562 */
5563 cpuid = lsb_first(idle_primary_map);
5564 if (cpuid >= 0) {
5565 processor = processor_array[cpuid];
5566 return processor;
5567 }
5568
5569 /*
5570 * Otherwise, enumerate active and idle processors to find primary candidates
5571 * with lower priority/etc.
5572 */
5573
5574 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5575 pset->recommended_bitmask &
5576 ~pset->pending_AST_URGENT_cpu_mask);
5577
5578 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5579 active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5580 }
5581
5582 active_map = bit_ror64(active_map, (pset->last_chosen + 1));
5583 for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5584 cpuid = ((rotid + pset->last_chosen + 1) & 63);
5585 processor = processor_array[cpuid];
5586
5587 integer_t cpri = processor->current_pri;
5588 processor_t primary = processor->processor_primary;
5589 if (primary != processor) {
5590 /* If primary is running a NO_SMT thread, don't choose its secondary */
5591 if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5592 if (cpri < lowest_secondary_priority) {
5593 lowest_secondary_priority = cpri;
5594 lp_paired_secondary_processor = processor;
5595 }
5596 }
5597 } else {
5598 if (cpri < lowest_priority) {
5599 lowest_priority = cpri;
5600 lp_processor = processor;
5601 }
5602 }
5603
5604 integer_t ccount = SCHED(processor_runq_count)(processor);
5605 if (ccount < lowest_count) {
5606 lowest_count = ccount;
5607 lc_processor = processor;
5608 }
5609 }
5610
5611 /*
5612 * For SMT configs, these idle secondary processors must have active primary. Otherwise
5613 * the idle primary would have short-circuited the loop above
5614 */
5615 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5616 ~pset->primary_map &
5617 pset->recommended_bitmask);
5618
5619 /* there shouldn't be a pending AST if the processor is idle */
5620 assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5621 assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5622
5623 for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5624 processor = processor_array[cpuid];
5625
5626 processor_t cprimary = processor->processor_primary;
5627
5628 integer_t primary_pri = cprimary->current_pri;
5629
5630 /*
5631 * TODO: This should also make the same decisions
5632 * as secondary_can_run_realtime_thread
5633 *
5634 * TODO: Keep track of the pending preemption priority
5635 * of the primary to make this more accurate.
5636 */
5637
5638 /* If the primary is running a no-smt thread, then don't choose its secondary */
5639 if (cprimary->state == PROCESSOR_RUNNING &&
5640 processor_active_thread_no_smt(cprimary)) {
5641 continue;
5642 }
5643
5644 /*
5645 * Find the idle secondary processor with the lowest priority primary
5646 *
5647 * We will choose this processor as a fallback if we find no better
5648 * primary to preempt.
5649 */
5650 if (primary_pri < lowest_idle_secondary_priority) {
5651 lp_idle_secondary_processor = processor;
5652 lowest_idle_secondary_priority = primary_pri;
5653 }
5654
5655 /* Find the the lowest priority active primary with idle secondary */
5656 if (primary_pri < lowest_unpaired_primary_priority) {
5657 /* If the primary processor is offline or starting up, it's not a candidate for this path */
5658 if (cprimary->state != PROCESSOR_RUNNING &&
5659 cprimary->state != PROCESSOR_DISPATCHING) {
5660 continue;
5661 }
5662
5663 if (!cprimary->is_recommended) {
5664 continue;
5665 }
5666
5667 /* if the primary is pending preemption, don't try to re-preempt it */
5668 if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5669 continue;
5670 }
5671
5672 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5673 bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5674 continue;
5675 }
5676
5677 lowest_unpaired_primary_priority = primary_pri;
5678 lp_unpaired_primary_processor = cprimary;
5679 }
5680 }
5681
5682 /*
5683 * We prefer preempting a primary processor over waking up its secondary.
5684 * The secondary will then be woken up by the preempted thread.
5685 */
5686 if (thread->sched_pri > lowest_unpaired_primary_priority) {
5687 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5688 return lp_unpaired_primary_processor;
5689 }
5690
5691 /*
5692 * We prefer preempting a lower priority active processor over directly
5693 * waking up an idle secondary.
5694 * The preempted thread will then find the idle secondary.
5695 */
5696 if (thread->sched_pri > lowest_priority) {
5697 pset->last_chosen = lp_processor->cpu_id;
5698 return lp_processor;
5699 }
5700
5701 /*
5702 * lc_processor is used to indicate the best processor set run queue
5703 * on which to enqueue a thread when all available CPUs are busy with
5704 * higher priority threads, so try to make sure it is initialized.
5705 */
5706 if (lc_processor == PROCESSOR_NULL) {
5707 cpumap_t available_map = pset_available_cpumap(pset);
5708 cpuid = lsb_first(available_map);
5709 if (cpuid >= 0) {
5710 lc_processor = processor_array[cpuid];
5711 lowest_count = SCHED(processor_runq_count)(lc_processor);
5712 }
5713 }
5714
5715 /*
5716 * Move onto the next processor set.
5717 *
5718 * If all primary processors in this pset are running a higher
5719 * priority thread, move on to next pset. Only when we have
5720 * exhausted the search for primary processors do we
5721 * fall back to secondaries.
5722 */
5723 #if CONFIG_SCHED_EDGE
5724 /*
5725 * The edge scheduler expects a CPU to be selected from the pset it passed in
5726 * as the starting pset for non-RT workloads. The edge migration algorithm
5727 * should already have considered idle CPUs and loads to decide the starting_pset;
5728 * which means that this loop can be short-circuted.
5729 */
5730 nset = starting_pset;
5731 #else /* CONFIG_SCHED_EDGE */
5732 nset = next_pset(pset);
5733 #endif /* CONFIG_SCHED_EDGE */
5734
5735 if (nset != starting_pset) {
5736 pset = change_locked_pset(pset, nset);
5737 }
5738 } while (nset != starting_pset);
5739
5740 /*
5741 * Make sure that we pick a running processor,
5742 * and that the correct processor set is locked.
5743 * Since we may have unlocked the candidate processor's
5744 * pset, it may have changed state.
5745 *
5746 * All primary processors are running a higher priority
5747 * thread, so the only options left are enqueuing on
5748 * the secondary processor that would perturb the least priority
5749 * primary, or the least busy primary.
5750 */
5751
5752 /* lowest_priority is evaluated in the main loops above */
5753 if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5754 processor = lp_idle_secondary_processor;
5755 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5756 processor = lp_paired_secondary_processor;
5757 } else if (lc_processor != PROCESSOR_NULL) {
5758 processor = lc_processor;
5759 } else {
5760 processor = PROCESSOR_NULL;
5761 }
5762
5763 if (processor) {
5764 pset = change_locked_pset(pset, processor->processor_set);
5765 /* Check that chosen processor is still usable */
5766 cpumap_t available_map = pset_available_cpumap(pset);
5767 if (bit_test(available_map, processor->cpu_id)) {
5768 pset->last_chosen = processor->cpu_id;
5769 return processor;
5770 }
5771
5772 /* processor is no longer usable */
5773 processor = PROCESSOR_NULL;
5774 }
5775
5776 pset_assert_locked(pset);
5777 pset_unlock(pset);
5778 return PROCESSOR_NULL;
5779 }
5780 #else /* CONFIG_SCHED_SMT */
5781 /*
5782 * choose_processor:
5783 *
5784 * Choose a processor for the thread, beginning at
5785 * the pset. Accepts an optional processor hint in
5786 * the pset.
5787 *
5788 * Returns a processor, possibly from a different pset.
5789 *
5790 * The thread must be locked. The pset must be locked,
5791 * and the resulting pset is locked on return.
5792 */
processor_t
choose_processor(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					uint64_t idle_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->recommended_bitmask);
					uint64_t non_avoided_idle_map = idle_map & ~pset->perfcontrol_cpu_migration_bitmask;
					/*
					 * If the rotation bitmask to force a migration is set for this core
					 * and there's an idle core that needn't be avoided, don't continue
					 * running on the same core.
					 */
					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_map != 0)) {
						return processor;
					}
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 */

	/* lowest_priority/lp_processor: least-priority active CPU seen so far.
	 * lowest_count/lc_processor: CPU with the shortest run queue seen so far. */
	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime path: walk every pset looking for a CPU that can run
		 * the thread now, or one running a further-deadline RT thread
		 * that can be preempted.
		 */
		pset_node_t node = pset->node;
		bool include_ast_urgent_pending_cpus = false;
		cpumap_t ast_urgent_pending;
try_again:
		ast_urgent_pending = 0;
		pset = change_locked_pset(pset, starting_pset);
		do {
			cpumap_t available_map = pset_available_cpumap(pset);
			if (available_map == 0) {
				goto no_available_cpus;
			}

			processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
			if (processor) {
				return processor;
			}

			processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
			if (processor) {
				/*
				 * Instead of looping through all the psets to find the global
				 * furthest deadline processor, preempt the first candidate found.
				 * The preempted thread will then find any other available far deadline
				 * processors to preempt.
				 */
				return processor;
			}

			ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;

			/* Remember the pset with the shortest RT run queue as a fallback. */
			if (rt_runq_count(pset) < lowest_count) {
				int cpuid = bit_first(available_map);
				assert(cpuid >= 0);
				lc_processor = processor_array[cpuid];
				lowest_count = rt_runq_count(pset);
			}

no_available_cpus:
			nset = next_pset(pset);

			if (nset != starting_pset) {
				pset = change_locked_pset(pset, nset);
			}
		} while (nset != starting_pset);


		/* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
			if (lc_processor) {
				pset_assert_locked(lc_processor->processor_set);
				return lc_processor;
			}
		} else {
			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
				include_ast_urgent_pending_cpus = true;
				goto try_again;
			}
		}

		processor = lc_processor;

		if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
			/* Check that chosen processor is still usable */
			cpumap_t available_map = pset_available_cpumap(pset);
			if (bit_test(available_map, processor->cpu_id)) {
				return processor;
			}

			/* processor is no longer usable */
			processor = PROCESSOR_NULL;
		}

		pset_assert_locked(pset);
		pset_unlock(pset);
		return PROCESSOR_NULL;
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */
		uint64_t idle_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->recommended_bitmask);
		uint64_t preferred_idle_map = idle_map & pset->perfcontrol_cpu_preferred_bitmask;

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		/*
		 * Look at the preferred cores first, resuming the round-robin
		 * after the last CPU chosen from this pset.
		 */
		int cpuid = lsb_next(preferred_idle_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_map);
		}
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			pset->cpu_preferred_last_chosen = cpuid;
			return processor;
		}

		/*
		 * Look at the cores that don't need to be avoided next.
		 */
		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
			uint64_t non_avoided_idle_map = idle_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_map);
			}
			if (cpuid >= 0) {
				processor = processor_array[cpuid];
				pset->cpu_preferred_last_chosen = cpuid;
				return processor;
			}
		}

		/*
		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
		 */
		cpuid = lsb_first(idle_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate the map so the scan begins just after the last chosen CPU. */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			if (cpri < lowest_priority) {
				lowest_priority = cpri;
				lp_processor = processor;
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	processor = lc_processor;

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
#endif /* CONFIG_SCHED_SMT */
6123
6124
6125
6126 /*
6127 * Default implementation of SCHED(choose_node)()
6128 * for single node systems
6129 */
6130 pset_node_t
sched_choose_node(__unused thread_t thread)6131 sched_choose_node(__unused thread_t thread)
6132 {
6133 return &pset_node0;
6134 }
6135
6136 /*
6137 * choose_starting_pset:
6138 *
6139 * Choose a starting processor set for the thread.
6140 * May return a processor hint within the pset.
6141 *
6142 * Returns a starting processor set, to be used by
6143 * choose_processor.
6144 *
6145 * The thread must be locked. The resulting pset is unlocked on return,
6146 * and is chosen without taking any pset locks.
6147 */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 * Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 * No Affinity case:
		 *
		 * Utilize a per task hint to spread threads
		 * among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime thread: steer towards a pset that still has a CPU
		 * not busy with realtime work, optionally skipping pset0.
		 */
		pset_map_t rt_target_map;
#if CONFIG_SCHED_SMT
		rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				/* Rotate so pset0 sorts last in the lsb_first() search. */
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
#endif /* CONFIG_SCHED_SMT*/
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-RT: prefer a pset that still has an idle CPU. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* Only propagate the hint if it lives in the pset we settled on. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
6269
6270 /*
6271 * thread_setrun:
6272 *
6273 * Dispatch thread for execution, onto an idle
6274 * processor or run queue, and signal a preemption
6275 * as appropriate.
6276 *
6277 * Thread must be locked.
6278 */
void
thread_setrun(
	thread_t                        thread,
	sched_options_t                 options)
{
	processor_t processor = PROCESSOR_NULL;
	processor_set_t pset;

	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	thread_assert_runq_null(thread);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 * Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary. A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			/* On success, choose_processor() returns with the chosen pset locked. */
			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
			if (processor != PROCESSOR_NULL) {
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely, we need a fallback.
		 */
		if (processor == PROCESSOR_NULL) {
			bool unlock_available_cores_lock = false;
			if (sched_all_cpus_offline()) {
				/*
				 * There are no available processors
				 * because we're in final system shutdown.
				 * Enqueue on the master processor and we'll
				 * handle it when it powers back up.
				 */
				processor = master_processor;
			} else if (support_bootcpu_shutdown) {
				/*
				 * Grab the sched_available_cores_lock to select
				 * some available processor and prevent it from
				 * becoming offline while we enqueue the thread.
				 *
				 * This is very close to a lock inversion, but
				 * places that do call thread_setrun with this
				 * lock held know that the current cpu will be
				 * schedulable, so we won't fall out of
				 * choose_processor.
				 */
				simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
				unlock_available_cores_lock = true;

				int last_resort_cpu = sched_last_resort_cpu();

				processor = processor_array[last_resort_cpu];
			} else {
				/*
				 * The master processor is never shut down, always safe to choose.
				 */
				processor = master_processor;
			}
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
			if (unlock_available_cores_lock) {
				simple_unlock(&sched_available_cores_lock);
			}
		}
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 * Bound case:
		 *
		 * Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 * Dispatch the thread on the chosen processor.
	 * TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		SCHED(check_spill)(pset, thread);
	}
}
6413
6414 processor_set_t
task_choose_pset(task_t task)6415 task_choose_pset(
6416 task_t task)
6417 {
6418 processor_set_t pset = task->pset_hint;
6419
6420 if (pset != PROCESSOR_SET_NULL) {
6421 pset = choose_next_pset(pset);
6422 }
6423
6424 return pset;
6425 }
6426
6427 /*
6428 * Check for a preemption point in
6429 * the current context.
6430 *
6431 * Called at splsched with thread locked.
6432 */
ast_t
csw_check(
	thread_t                thread,
	processor_t             processor,
	ast_t                   check_reason)
{
	processor_set_t pset = processor->processor_set;

	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Publish the running thread's current pri/etc. before evaluating preemption. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	/* May arm the deferred-preemption timer, or upgrade the AST to urgent. */
	return update_pending_nonurgent_preemption(processor, preempt);
}
6465
6466 void
clear_pending_nonurgent_preemption(processor_t processor)6467 clear_pending_nonurgent_preemption(processor_t processor)
6468 {
6469 if (!processor->pending_nonurgent_preemption) {
6470 return;
6471 }
6472
6473 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
6474
6475 processor->pending_nonurgent_preemption = false;
6476 running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
6477 }
6478
ast_t
update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
{
	/* Only a plain (non-urgent) AST_PREEMPT qualifies for deferral. */
	if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
		clear_pending_nonurgent_preemption(processor);
		return reason;
	}

	if (nonurgent_preemption_timer_abs == 0) {
		/* Preemption timer not enabled */
		return reason;
	}

	if (current_thread()->state & TH_IDLE) {
		/* idle threads don't need nonurgent preemption */
		return reason;
	}

	if (processor->pending_nonurgent_preemption) {
		/* Timer is already armed, no need to do it again */
		return reason;
	}

	if (ml_did_interrupt_userspace()) {
		/*
		 * We're preempting userspace here, so we don't need
		 * to defer the preemption. Force AST_URGENT
		 * so that we can avoid arming this timer without risking
		 * ast_taken_user deciding to spend too long in kernel
		 * space to handle other ASTs.
		 */

		return reason | AST_URGENT;
	}

	/*
	 * We've decided to do a nonurgent preemption when running in
	 * kernelspace. We defer the preemption until reaching userspace boundary
	 * to give a grace period for locks etc to be dropped and to reach
	 * a clean preemption point, so that the preempting thread doesn't
	 * always immediately hit the lock that the waking thread still holds.
	 *
	 * Arm a timer to enforce that the preemption executes within a bounded
	 * time if the thread doesn't block or return to userspace quickly.
	 */

	processor->pending_nonurgent_preemption = true;
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
	    reason);

	uint64_t now = mach_absolute_time();

	uint64_t deadline = now + nonurgent_preemption_timer_abs;

	/* Expiry runs thread_preempt_expire() on this processor. */
	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
	    now, deadline);

	return reason;
}
6538
6539 /*
6540 * Check for preemption at splsched with
6541 * pset locked and processor as the current
6542 * processor.
6543 */
ast_t
csw_check_locked(
	thread_t                thread,
	processor_t             processor,
	processor_set_t         pset,
	ast_t                   check_reason)
{
	assert(processor == current_processor());
	/*
	 * If the current thread is running on a processor that is no longer recommended,
	 * urgently preempt it, at which point thread_select() should
	 * try to idle the processor and re-dispatch the thread to a recommended processor.
	 */
	if (!processor->is_recommended) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/* This CPU has been tapped to absorb spilled-over realtime work. */
	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	if (rt_runq_count(pset) > 0) {
		/* Queued RT work: urgent if it outranks us, we exhausted our
		 * timeslice, or its deadline beats ours by more than epsilon. */
		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else {
			return check_reason | AST_PREEMPT;
		}
	}

	/* Ask the scheduler policy whether its run queue warrants a switch. */
	ast_t result = SCHED(processor_csw_check)(processor);
	if (result != AST_NONE) {
		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
	}

	/*
	 * Same for avoid-processor
	 *
	 * TODO: Should these set AST_REBALANCE?
	 */
	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SMT
	/*
	 * Even though we could continue executing on this processor, a
	 * secondary SMT core should try to shed load to another primary core.
	 *
	 * TODO: Should this do the same check that thread_select does? i.e.
	 * if no bound threads target this processor, and idle primaries exist, preempt
	 * The case of RT threads existing is already taken care of above
	 */

	if (processor->current_pri < BASEPRI_RTQUEUES &&
	    processor->processor_primary != processor) {
		return check_reason | AST_PREEMPT;
	}
#endif /* CONFIG_SCHED_SMT*/

	if (thread->state & TH_SUSP) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SFI
	/*
	 * Current thread may not need to be preempted, but maybe needs
	 * an SFI wait?
	 */
	result = sfi_thread_needs_ast(thread, NULL);
	if (result != AST_NONE) {
		return result;
	}
#endif

	return AST_NONE;
}
6622
6623 /*
6624 * Handle preemption IPI or IPI in response to setting an AST flag
6625 * Triggered by cause_ast_check
6626 * Called at splsched
6627 */
void
ast_check(processor_t processor)
{
	smr_ack_ipi();

	/* Only a running processor evaluates preemption here. */
	if (processor->state != PROCESSOR_RUNNING) {
		return;
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
}
6697
6698
void
thread_preempt_expire(
	timer_call_param_t      p0,
	__unused timer_call_param_t      p1)
{
	/* Running-timer callback: the deferred nonurgent preemption's
	 * grace period has elapsed without the thread blocking or
	 * returning to userspace, so re-check and force the preemption. */
	processor_t processor = p0;

	assert(processor == current_processor());
	assert(p1 == NULL);

	thread_t thread = current_thread();

	/*
	 * This is set and cleared by the current core, so we will
	 * never see a race with running timer expiration
	 */
	assert(processor->pending_nonurgent_preemption);

	clear_pending_nonurgent_preemption(processor);

	thread_lock(thread);

	/*
	 * Check again to see if it's still worth a
	 * context switch, but this time force enable kernel preemption
	 */

	ast_t preempt = csw_check(thread, processor, AST_URGENT);

	if (preempt) {
		ast_on(preempt);
	}

	thread_unlock(thread);

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
}
6736
6737
6738 /*
6739 * set_sched_pri:
6740 *
6741 * Set the scheduled priority of the specified thread.
6742 *
6743 * This may cause the thread to change queues.
6744 *
6745 * Thread must be locked.
6746 */
void
set_sched_pri(
	thread_t thread,
	int16_t new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* A running thread must be off any run queue. */
		assert(thread->state & TH_RUN);
		thread_assert_runq_null(thread);
	} else {
		/* Pull the thread off its run queue (if any) before repricing. */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Re-queue at the new priority. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/*
		 * Runnable-but-not-queued thread: it is running on another
		 * core.  Poke that core so it re-evaluates preemption.
		 */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
6858
6859 /*
6860 * thread_run_queue_remove_for_handoff
6861 *
6862 * Pull a thread or its (recursive) push target out of the runqueue
6863 * so that it is ready for thread_run()
6864 *
6865 * Called at splsched
6866 *
6867 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6868 * This may be different than the thread that was passed in.
6869 */
thread_t
thread_run_queue_remove_for_handoff(thread_t thread)
{
	thread_t pulled_thread = THREAD_NULL;

	thread_lock(thread);

	/*
	 * Check that the thread is not bound to a different processor,
	 * NO_SMT flag is not set on the thread, cluster type of
	 * processor matches with thread if the thread is pinned to a
	 * particular cluster and that realtime is not involved.
	 *
	 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
	 */
	processor_t processor = current_processor();
	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
#if CONFIG_SCHED_SMT
	    && (!thread_no_smt(thread))
#endif /* CONFIG_SCHED_SMT */
	    && (processor->current_pri < BASEPRI_RTQUEUES)
	    && (thread->sched_pri < BASEPRI_RTQUEUES)
#if __AMP__
	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
	    ) {
		/* Eligible only if it actually comes off a run queue. */
		if (thread_run_queue_remove(thread)) {
			pulled_thread = thread;
		}
	}

	thread_unlock(thread);

	/* THREAD_NULL if the thread was ineligible or not enqueued. */
	return pulled_thread;
}
6906
6907 /*
6908 * thread_prepare_for_handoff
6909 *
6910 * Make the thread ready for handoff.
 * If the thread was runnable, pull it off the runq; if the thread could
 * not be pulled, return NULL.
6913 *
6914 * If the thread was woken up from wait for handoff, make sure it is not bound to
6915 * different processor.
6916 *
6917 * Called at splsched
6918 *
6919 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6920 * This may be different than the thread that was passed in.
6921 */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/* Freshly woken thread: not on a run queue yet. */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
#if CONFIG_SCHED_SMT
		    && (!thread_no_smt(thread))
#endif /* CONFIG_SCHED_SMT */
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			/* Not eligible to run here: enqueue it normally instead. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Runnable thread: try to yank it off its run queue. */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6958
6959 /*
6960 * thread_run_queue_remove:
6961 *
6962 * Remove a thread from its current run queue and
6963 * return TRUE if successful.
6964 *
6965 * Thread must be locked.
6966 *
6967 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6968 * run queues because the caller locked the thread. Otherwise
6969 * the thread is on a run queue, but could be chosen for dispatch
6970 * and removed by another processor under a different lock, which
6971 * will set thread->runq to PROCESSOR_NULL.
6972 *
6973 * Hence the thread select path must not rely on anything that could
6974 * be changed under the thread lock after calling this function,
6975 * most importantly thread->sched_pri.
6976 */
boolean_t
thread_run_queue_remove(
	thread_t thread)
{
	boolean_t removed = FALSE;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		thread_assert_runq_null(thread);
		return FALSE;
	}

	processor_t processor = thread_get_runq(thread);
	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		/* Non-realtime: delegate to the scheduler's own removal op. */
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime thread: must manipulate the RT runq under the pset lock. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/*
	 * Must re-read the thread runq after acquiring the pset lock, in
	 * case another core swooped in before us to dequeue the thread.
	 */
	if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
7031
7032 /*
7033 * Put the thread back where it goes after a thread_run_queue_remove
7034 *
7035 * Thread must have been removed under the same thread lock hold
7036 *
7037 * thread locked, at splsched
7038 */
void
thread_run_queue_reinsert(thread_t thread, sched_options_t options)
{
	/* Caller must have dequeued it (still runnable, not on any runq). */
	thread_assert_runq_null(thread);
	assert(thread->state & (TH_RUN));

	thread_setrun(thread, options);
}
7047
7048 void
sys_override_cpu_throttle(boolean_t enable_override)7049 sys_override_cpu_throttle(boolean_t enable_override)
7050 {
7051 if (enable_override) {
7052 cpu_throttle_enabled = 0;
7053 } else {
7054 cpu_throttle_enabled = 1;
7055 }
7056 }
7057
/*
 * thread_get_urgency:
 *
 *	Classify a thread's urgency for the performance controller.
 *	Optionally returns two class-specific parameters through
 *	arg1/arg2 (RT period/deadline, priorities, or throughput QoS);
 *	either out-parameter may be NULL.
 */
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task yet, or the idle thread: no urgency to report. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime threads report their RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * BACKGROUND urgency applies when priority is at or below
		 * MAXPRI_THROTTLE and the thread carries an explicit QoS
		 * (or its task has an active suppression policy); otherwise
		 * such low-priority threads are reported as LOWPRI.
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
7119
7120 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)7121 thread_get_perfcontrol_class(thread_t thread)
7122 {
7123 /* Special case handling */
7124 if (thread->state & TH_IDLE) {
7125 return PERFCONTROL_CLASS_IDLE;
7126 }
7127
7128 if (thread->sched_mode == TH_MODE_REALTIME) {
7129 return PERFCONTROL_CLASS_REALTIME;
7130 }
7131
7132 /* perfcontrol_class based on base_pri */
7133 if (thread->base_pri <= MAXPRI_THROTTLE) {
7134 return PERFCONTROL_CLASS_BACKGROUND;
7135 } else if (thread->base_pri <= BASEPRI_UTILITY) {
7136 return PERFCONTROL_CLASS_UTILITY;
7137 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
7138 return PERFCONTROL_CLASS_NONUI;
7139 } else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
7140 return PERFCONTROL_CLASS_USER_INITIATED;
7141 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
7142 return PERFCONTROL_CLASS_UI;
7143 } else {
7144 if (get_threadtask(thread) == kernel_task) {
7145 /*
7146 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
7147 * All other lower priority kernel threads should be treated
7148 * as regular threads for performance control purposes.
7149 */
7150 return PERFCONTROL_CLASS_KERNEL;
7151 }
7152 return PERFCONTROL_CLASS_ABOVEUI;
7153 }
7154 }
7155
7156 /*
7157 * This is the processor idle loop, which just looks for other threads
7158 * to execute. Processor idle threads invoke this without supplying a
7159 * current thread to idle without an asserted wait state.
7160 *
 * Returns the next thread to execute if dispatched directly.
7162 */
7163
/* Verbose idle-loop tracing; compiled out by default (flip to #if 1 to enable). */
#if 0
#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
#else
#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
#endif

#if (DEVELOPMENT || DEBUG)
/* Debug knob: cpu_id that should delay after waking from idle (-1 = none). */
int sched_idle_delay_cpuid = -1;
#endif
7173
thread_t
processor_idle(
	thread_t thread,
	processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	/* Idle loop runs with interrupts disabled except inside machine_idle(). */
	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Start accounting this period as idle time. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Exit conditions: state change, pending ASTs, or RT spill work. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (
			processor->is_recommended
#if CONFIG_SCHED_SMT
			&& (processor->processor_primary == processor)
#endif /* CONFIG_SCHED_SMT */
			) {
			/* Recommended (primary) processor: wake for RT work in the pset. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Otherwise only wake for threads bound to this processor. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Debug-only: inject latency on a chosen cpu after idle wake. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Re-sync with state published by other cores while we slept. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
#if CONFIG_SCHED_SMT
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
#else /* CONFIG_SCHED_SMT*/
			break;
#endif /* CONFIG_SCHED_SMT*/
		}
	}

	/* Idle period over: account the transition back to running. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
7308
7309 /*
7310 * Each processor has a dedicated thread which
7311 * executes the idle loop when there is no suitable
7312 * previous context.
7313 *
7314 * This continuation is entered with interrupts disabled.
7315 */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	smr_cpu_leave(processor, processor->last_dispatch);

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/* Hand off directly to the selected thread; does not return. */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing chosen directly: block and let the scheduler pick. */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
7353
/*
 * idle_thread_create:
 *
 *	Create and initialize the dedicated idle thread for a processor,
 *	binding it to that processor at IDLEPRI.  The continuation is the
 *	idle loop entry point.  Panics on allocation failure.
 */
void
idle_thread_create(
	processor_t processor,
	thread_continue_t continuation)
{
	kern_return_t result;
	thread_t thread;
	spl_t s;
	char name[MAXTHREADNAMESIZE];

	result = kernel_thread_create(continuation, NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("idle_thread_create failed: %d", result);
	}

	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	thread_set_thread_name(thread, name);

	/* Configure scheduling state under the thread lock at splsched. */
	s = splsched();
	thread_lock(thread);
	thread->bound_processor = processor;
	thread->chosen_processor = processor;
	processor->idle_thread = thread;
	thread->sched_pri = thread->base_pri = IDLEPRI;
	thread->state = (TH_RUN | TH_IDLE);
	thread->options |= TH_OPT_IDLE_THREAD;
	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
	thread_unlock(thread);
	splx(s);

	/* The processor's idle_thread field keeps it alive; drop our ref. */
	thread_deallocate(thread);
}
7386
7387 /*
7388 * sched_startup:
7389 *
7390 * Kicks off scheduler services.
7391 *
7392 * Called at splsched.
7393 */
void
sched_startup(void)
{
	kern_return_t result;
	thread_t thread;

	simple_lock_init(&sched_vm_group_list_lock, 0);

	/* Spawn the scheduler maintenance thread (runs sched_init_thread). */
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("sched_startup");
	}

	thread_deallocate(thread);

	assert_thread_magic(thread);

	/*
	 * Yield to the sched_init_thread once, to
	 * initialize our own thread after being switched
	 * back to.
	 *
	 * The current thread is the only other thread
	 * active at this point.
	 */
	thread_block(THREAD_CONTINUE_NULL);

	assert_thread_magic(thread);
}
7424
#if __arm64__
/* Absolute-time deadline for the CLPC callback; 0 means none armed. */
static _Atomic uint64_t sched_perfcontrol_callback_deadline;
#endif /* __arm64__ */


#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Next absolute time at which the maintenance thread should be woken. */
static volatile uint64_t sched_maintenance_deadline;
/* Timestamp of the previous maintenance pass, for computing tick deltas. */
static uint64_t sched_tick_last_abstime;
/* Number of pseudo-ticks elapsed since the last pass (>= 1, capped). */
static uint64_t sched_tick_delta;
/* High-water mark of sched_tick_delta observed since boot. */
uint64_t sched_tick_max_delta;
7436
7437
7438 /*
7439 * sched_init_thread:
7440 *
7441 * Perform periodic bookkeeping functions about ten
7442 * times per second.
7443 */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t sched_tick_ctime, late_time;

	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First pass since boot: no elapsed interval to account. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 * Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 * Scan the run queues for threads which
	 * may need to be updated, and find the earliest runnable thread on the runqueue
	 * to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/* Report the longest time any runnable thread has waited, per band. */
	uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until the next consider_maintenance wakeup; never returns. */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
7549
/* Count of maintenance-thread wakeups issued by consider_maintenance. */
static uint64_t sched_maintenance_wakeups;
7551
7552 /*
7553 * Determine if the set of routines formerly driven by a maintenance timer
7554 * must be invoked, based on a deadline comparison. Signals the scheduler
7555 * maintenance thread on deadline expiration. Must be invoked at an interval
7556 * lower than the "sched_tick_interval", currently accomplished by
7557 * invocation via the quantum expiration timer and at context switch time.
7558 * Performance matters: this routine reuses a timestamp approximating the
7559 * current absolute time received from the caller, and should perform
7560 * no more than a comparison against the deadline in the common case.
7561 */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread must never try to wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/*
		 * Exactly one racing caller wins the CAS and pays for the
		 * wakeup; all others observe the advanced deadline.
		 */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 claims the work; rearm the deadline after computing. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
7612
7613 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7614
/*
 * sched_init_thread:
 *
 *	Entry point for the scheduler maintenance thread.  Blocks once so
 *	sched_startup() can finish, then names itself and enters the
 *	scheduler's maintenance continuation, never returning.
 */
void
sched_init_thread(void)
{
	/* Yield back to sched_startup() before starting maintenance work. */
	thread_block(THREAD_CONTINUE_NULL);

	thread_t thread = current_thread();

	thread_set_thread_name(thread, "sched_maintenance_thread");

	sched_maintenance_thread = thread;

	SCHED(maintenance_continuation)();

	/*NOTREACHED*/
}
7630
7631 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
7632
7633 /*
7634 * thread_update_scan / runq_scan:
7635 *
7636 * Scan the run queues to account for timesharing threads
7637 * which need to be updated.
7638 *
7639 * Scanner runs in two passes. Pass one squirrels likely
7640 * threads away in an array, pass two does the update.
7641 *
7642 * This is necessary because the run queue is locked for
7643 * the candidate scan, but the thread is locked for the update.
7644 *
7645 * Array should be sized to make forward progress, without
7646 * disabling preemption for long periods.
7647 */
7648
/* Max threads stashed per scan pass; bounds preemption-disabled time. */
#define THREAD_UPDATE_SIZE 128

/* Pass-one stash of referenced threads awaiting a pass-two priority update. */
static thread_t thread_update_array[THREAD_UPDATE_SIZE];
static uint32_t thread_update_count = 0;
7653
7654 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7655 boolean_t
thread_update_add_thread(thread_t thread)7656 thread_update_add_thread(thread_t thread)
7657 {
7658 if (thread_update_count == THREAD_UPDATE_SIZE) {
7659 return FALSE;
7660 }
7661
7662 thread_update_array[thread_update_count++] = thread;
7663 thread_reference(thread);
7664 return TRUE;
7665 }
7666
7667 /* Returns whether the kernel should report that a thread triggered the fail-safe. */
7668 static bool
thread_should_report_failsafe(thread_t thread)7669 thread_should_report_failsafe(thread_t thread)
7670 {
7671 if ((thread->sched_flags & TH_SFLAG_FAILSAFE) && !(thread->sched_flags & TH_SFLAG_FAILSAFE_REPORTED)) {
7672 /* disarm the trigger for subsequent invocations */
7673 thread->sched_flags |= TH_SFLAG_FAILSAFE_REPORTED;
7674 return true;
7675 }
7676 return false;
7677 }
7678
/*
 * thread_update_process_threads:
 *
 *	Pass two of the runq scan: for each thread stashed by
 *	thread_update_add_thread(), refresh its timeshare priority and
 *	log any fail-safe trigger, then drop the reference taken in
 *	pass one.  Called with interrupts enabled.
 */
void
thread_update_process_threads(void)
{
	assert(thread_update_count <= THREAD_UPDATE_SIZE);

	for (uint32_t i = 0; i < thread_update_count; i++) {
		thread_t thread = thread_update_array[i];
		assert_thread_magic(thread);
		thread_update_array[i] = THREAD_NULL;

		spl_t s = splsched();
		thread_lock(thread);

		/* Capture fail-safe state under the lock; report after splx. */
		const bool should_report_failsafe = thread_should_report_failsafe(thread);
		const sched_mode_t saved_mode = thread->saved_mode; // if reporting

		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
			SCHED(update_priority)(thread);
		}
		thread_unlock(thread);
		splx(s);

		/* now that interrupts are enabled, it is safe to report fail-safe triggers */
		if (should_report_failsafe) {
			assert((saved_mode & TH_MODE_REALTIME) || (saved_mode & TH_MODE_FIXED));
			/* We still hold a reference, so the thread cannot vanish here. */
			uint64_t th_id = thread->thread_id;
			char th_name[MAXTHREADNAMESIZE] = "unknown";
			if (thread_has_thread_name(thread)) {
				thread_get_thread_name(thread, th_name);
			}
			task_t task = get_threadtask(thread);
			assert(task != NULL);
			const char* t_name = task_best_name(task);
			pid_t t_pid = task_pid(task);
			const int quanta = (saved_mode & TH_MODE_REALTIME) ? max_unsafe_rt_quanta : max_unsafe_fixed_quanta;
			const char* mode = (saved_mode & TH_MODE_REALTIME) ? "realtime" : "fixed";
			os_log_error(OS_LOG_DEFAULT, "scheduler: thread %s [%llx] in "
			    "process %s [%d] triggered fail-safe by spinning for at least %d"
			    "us at %s priority\n",
			    th_name,
			    th_id,
			    t_name,
			    t_pid,
			    quanta * (int) sched_get_quantum_us(),
			    mode);
		}

		thread_deallocate(thread);
	}

	thread_update_count = 0;
}
7731
/*
 * Examine one thread during a run queue scan: queue it for a priority
 * update if it is a timeshare thread that hasn't been updated this tick,
 * and track the earliest make-runnable timestamps in scan_context.
 *
 * Returns TRUE if the scan must be retried because thread_update_array
 * filled up, FALSE otherwise.
 */
static boolean_t
runq_scan_thread(
	thread_t thread,
	sched_update_scan_context_t scan_context)
{
	assert_thread_magic(thread);

	if (thread->sched_stamp != sched_tick &&
	    thread->sched_mode == TH_MODE_TIMESHARE) {
		if (thread_update_add_thread(thread) == FALSE) {
			/* Array full: caller must retry the scan later. */
			return TRUE;
		}
	}

	/* Classify by throttle band when CPU throttling is active. */
	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
		if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
			scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
		}
	} else {
		if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
			scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
		}
	}

	return FALSE;
}
7758
7759 /*
7760 * Scan a runq for candidate threads.
7761 *
7762 * Returns TRUE if retry is needed.
7763 */
/*
 * Scan a runq for candidate threads.
 *
 * Walks every non-empty priority level (via the runq bitmap) and passes
 * each enqueued thread to runq_scan_thread().
 *
 * Returns TRUE if retry is needed (thread_update_array filled up).
 */
boolean_t
runq_scan(
	run_queue_t runq,
	sched_update_scan_context_t scan_context)
{
	int count = runq->count;
	int queue_index;

	assert(count >= 0);

	if (count == 0) {
		return FALSE;
	}

	/* Iterate only the priority levels marked non-empty in the bitmap. */
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
		thread_t thread;
		circle_queue_t queue = &runq->queues[queue_index];

		cqe_foreach_element(thread, queue, runq_links) {
			/* count is a cross-check that we never visit more threads than runq->count */
			assert(count > 0);
			if (runq_scan_thread(thread, scan_context) == TRUE) {
				return TRUE;
			}
			count--;
		}
	}

	return FALSE;
}
7795
7796 #if CONFIG_SCHED_CLUTCH
7797
/*
 * Clutch-scheduler variant of the timeshare scan: walk a queue of
 * timeshare threads and pass each to runq_scan_thread().
 *
 * thread_count is a cross-check of the expected queue length.
 * Returns TRUE if retry is needed (thread_update_array filled up).
 */
boolean_t
sched_clutch_timeshare_scan(
	queue_t thread_queue,
	uint16_t thread_count,
	sched_update_scan_context_t scan_context)
{
	if (thread_count == 0) {
		return FALSE;
	}

	thread_t thread;
	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
		if (runq_scan_thread(thread, scan_context) == TRUE) {
			return TRUE;
		}
		thread_count--;
	}

	/* Every thread in the queue must have been visited exactly once. */
	assert(thread_count == 0);
	return FALSE;
}
7819
7820
7821 #endif /* CONFIG_SCHED_CLUTCH */
7822
7823 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7824
7825 bool
thread_is_eager_preempt(thread_t thread)7826 thread_is_eager_preempt(thread_t thread)
7827 {
7828 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7829 }
7830
/*
 * Mark a thread as eagerly preemptible, and kick the appropriate processor
 * so the new state takes effect: for the current thread, run csw_check and
 * possibly block; for a remote running thread, send an AST check to its
 * last processor.
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		/* Must drop the thread lock before potentially blocking. */
		thread_unlock(thread);

		if (ast != AST_NONE) {
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/* Only poke the remote CPU if the thread is actually running there. */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
7864
/*
 * Clear the eager-preemption flag on a thread. If the thread is currently
 * running on this processor, also clear the processor's cached
 * current_is_eagerpreempt state.
 */
void
thread_clear_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(thread_is_eager_preempt(thread));

	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		current_processor()->current_is_eagerpreempt = false;
	}

	thread_unlock(thread);
	splx(s);
}
7882
7883 /*
7884 * Scheduling statistics
7885 */
7886 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7887 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7888 {
7889 struct sched_statistics *stats;
7890 boolean_t to_realtime = FALSE;
7891
7892 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7893 stats->csw_count++;
7894
7895 if (otherpri >= BASEPRI_REALTIME) {
7896 stats->rt_sched_count++;
7897 to_realtime = TRUE;
7898 }
7899
7900 if ((reasons & AST_PREEMPT) != 0) {
7901 stats->preempt_count++;
7902
7903 if (selfpri >= BASEPRI_REALTIME) {
7904 stats->preempted_rt_count++;
7905 }
7906
7907 if (to_realtime) {
7908 stats->preempted_by_rt_count++;
7909 }
7910 }
7911 }
7912
7913 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7914 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7915 {
7916 uint64_t timestamp = mach_absolute_time();
7917
7918 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7919 stats->last_change_timestamp = timestamp;
7920 }
7921
7922 /*
7923 * For calls from assembly code
7924 */
7925 #undef thread_wakeup
7926 void
7927 thread_wakeup(
7928 event_t x);
7929
7930 void
thread_wakeup(event_t x)7931 thread_wakeup(
7932 event_t x)
7933 {
7934 thread_wakeup_with_result(x, THREAD_AWAKENED);
7935 }
7936
7937 boolean_t
preemption_enabled(void)7938 preemption_enabled(void)
7939 {
7940 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7941 }
7942
/*
 * Convert the default timer-deadline tracking bin thresholds from
 * nanoseconds to mach absolute time units.
 */
static void
sched_timer_deadline_tracking_init(void)
{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
7949
7950 /*
7951 * Check that all CPUs are successfully powered up in places where that's expected.
7952 */
7953 static void
check_all_cpus_are_done_starting(processor_start_kind_t start_kind)7954 check_all_cpus_are_done_starting(processor_start_kind_t start_kind)
7955 {
7956 /*
7957 * `processor_count` may include registered CPUs above cpus= or cpumask= limit.
7958 * Use machine_info.logical_cpu_max for the CPU IDs that matter.
7959 */
7960 for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) {
7961 processor_t processor = processor_array[cpu_id];
7962 processor_wait_for_start(processor, start_kind);
7963 }
7964 }
7965
7966 /*
7967 * Find some available online CPU that threads can be enqueued on
7968 *
7969 * Called with the sched_available_cores_lock held
7970 */
7971 static int
sched_last_resort_cpu(void)7972 sched_last_resort_cpu(void)
7973 {
7974 simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);
7975
7976 int last_resort_cpu = lsb_first(pcs.pcs_effective.pcs_online_cores);
7977
7978 if (last_resort_cpu == -1) {
7979 panic("no last resort cpu found!");
7980 }
7981
7982 return last_resort_cpu;
7983 }
7984
7985
7986 static void
assert_no_processors_in_transition_locked()7987 assert_no_processors_in_transition_locked()
7988 {
7989 assert(pcs.pcs_in_kernel_sleep == false);
7990
7991 /* All processors must be either running or offline */
7992 assert(pcs.pcs_managed_cores ==
7993 (processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING] |
7994 processor_offline_state_map[PROCESSOR_OFFLINE_FULLY_OFFLINE]));
7995
7996 /* All state transitions must be quiesced at this point */
7997 assert(pcs.pcs_effective.pcs_online_cores ==
7998 processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING]);
7999 }
8000
8001 static struct powered_cores_state
sched_compute_requested_powered_cores()8002 sched_compute_requested_powered_cores()
8003 {
8004 simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);
8005
8006 struct powered_cores_state output = {
8007 .pcs_online_cores = pcs.pcs_managed_cores,
8008 .pcs_powerdown_recommended_cores = pcs.pcs_managed_cores,
8009 .pcs_tempdown_cores = 0,
8010 };
8011
8012 if (!pcs.pcs_init_completed) {
8013 return output;
8014 }
8015
8016 /*
8017 * if we unify this with derecommendation, note that only sleep should stop derecommendation,
8018 * not dtrace et al
8019 */
8020 if (pcs.pcs_powerdown_suspend_count) {
8021 return output;
8022 } else {
8023 /*
8024 * The cores power clients like ANE require or
8025 * the kernel cannot offline
8026 */
8027 cpumap_t system_required_powered_cores = pcs.pcs_required_online_pmgr |
8028 pcs.pcs_required_online_system;
8029
8030 cpumap_t online_cores_goal;
8031
8032 if (pcs.pcs_user_online_core_control) {
8033 /* This is our new goal state for powered cores */
8034 output.pcs_powerdown_recommended_cores = pcs.pcs_requested_online_user;
8035 online_cores_goal = pcs.pcs_requested_online_user | system_required_powered_cores;
8036 } else {
8037 /* Remove the cores CLPC wants to power down */
8038 cpumap_t clpc_wanted_powered_cores = pcs.pcs_managed_cores;
8039 clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_user;
8040 clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_system;
8041
8042 output.pcs_powerdown_recommended_cores = clpc_wanted_powered_cores;
8043 online_cores_goal = clpc_wanted_powered_cores | system_required_powered_cores;
8044
8045 /* Any cores in managed cores that are not in wanted powered become temporary */
8046 output.pcs_tempdown_cores = (pcs.pcs_managed_cores & ~clpc_wanted_powered_cores);
8047
8048 /* Future: Treat CLPC user/system separately. */
8049 }
8050
8051 if (online_cores_goal == 0) {
8052 /*
8053 * If we're somehow trying to disable all CPUs,
8054 * force online the lowest numbered CPU.
8055 */
8056 online_cores_goal = BIT(lsb_first(pcs.pcs_managed_cores));
8057 }
8058
8059 #if RHODES_CLUSTER_POWERDOWN_WORKAROUND
8060 /*
8061 * Because warm CPU boot from WFI is not currently implemented,
8062 * we cannot power down only one CPU in a cluster, so we force up
8063 * all the CPUs in the cluster if any one CPU is up in the cluster.
8064 * Once all CPUs are disabled, then the whole cluster goes down at once.
8065 */
8066
8067 cpumap_t workaround_online_cores = 0;
8068
8069 const ml_topology_info_t* topology = ml_get_topology_info();
8070 for (unsigned int i = 0; i < topology->num_clusters; i++) {
8071 ml_topology_cluster_t* cluster = &topology->clusters[i];
8072 if ((cluster->cpu_mask & online_cores_goal) != 0) {
8073 workaround_online_cores |= cluster->cpu_mask;
8074 }
8075 }
8076
8077 online_cores_goal = workaround_online_cores;
8078 #endif /* RHODES_CLUSTER_POWERDOWN_WORKAROUND */
8079
8080 output.pcs_online_cores = online_cores_goal;
8081 }
8082
8083 return output;
8084 }
8085
8086 static bool
sched_needs_update_requested_powered_cores()8087 sched_needs_update_requested_powered_cores()
8088 {
8089 if (!pcs.pcs_init_completed) {
8090 return false;
8091 }
8092
8093 struct powered_cores_state requested = sched_compute_requested_powered_cores();
8094
8095 struct powered_cores_state effective = pcs.pcs_effective;
8096
8097 if (requested.pcs_powerdown_recommended_cores != effective.pcs_powerdown_recommended_cores ||
8098 requested.pcs_online_cores != effective.pcs_online_cores ||
8099 requested.pcs_tempdown_cores != effective.pcs_tempdown_cores) {
8100 return true;
8101 } else {
8102 return false;
8103 }
8104 }
8105
/*
 * Handle a userspace request to power down a processor.
 *
 * Called with cluster_powerdown_lock held and preemption enabled.
 * Validates that user core control is supported and permitted for this CPU,
 * then clears it from the user-requested online set and triggers a
 * powered-cores update if needed.
 *
 * Returns:
 *   KERN_NOT_SUPPORTED    - feature disabled, or CPU can never go offline
 *   KERN_FAILURE          - PMGR requires the CPU online, or powerdown suspended
 *   KERN_NODE_DOWN        - CPU already powered off by userspace
 *   KERN_RESOURCE_SHORTAGE - this is the last available core
 *   KERN_SUCCESS          - request accepted
 */
kern_return_t
sched_processor_exit_user(processor_t processor)
{
	assert(processor);

	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	assert(preemption_enabled());

	kern_return_t result;

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (!enable_processor_exit) {
		/* This API is not supported on this device. */
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) {
		/* This CPU can never change state outside of sleep. */
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	/*
	 * Future: Instead of failing, simulate the processor
	 * being shut down via derecommendation and decrementing active count.
	 */
	if (bit_test(pcs.pcs_required_online_pmgr, processor->cpu_id)) {
		/* PMGR won't let us power down this CPU right now. */
		result = KERN_FAILURE;
		goto unlock;
	}

	if (pcs.pcs_powerdown_suspend_count) {
		/* A tool that disables CPU powerdown is active. */
		result = KERN_FAILURE;
		goto unlock;
	}

	if (!bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) {
		/* The CPU is already powered off by userspace. */
		result = KERN_NODE_DOWN;
		goto unlock;
	}

	if ((pcs.pcs_recommended_cores & pcs.pcs_effective.pcs_online_cores) == BIT(processor->cpu_id)) {
		/* This is the last available core, can't shut it down. */
		result = KERN_RESOURCE_SHORTAGE;
		goto unlock;
	}

	result = KERN_SUCCESS;

	/* First user powerdown request switches to user online-core control mode. */
	if (!pcs.pcs_user_online_core_control) {
		pcs.pcs_user_online_core_control = true;
	}

	bit_clear(pcs.pcs_requested_online_user, processor->cpu_id);

	if (sched_needs_update_requested_powered_cores()) {
		/* Note: temporarily drops (and reacquires) the available-cores lock. */
		sched_update_powered_cores_drops_lock(REASON_USER, s);
	}

unlock:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return result;
}
8177
/*
 * Handle a userspace request to power a processor back on.
 *
 * Called with cluster_powerdown_lock held and preemption enabled.
 * Sets the CPU in the user-requested online set; once the user has all
 * CPUs back online, user online-core control mode is exited and automatic
 * cluster powerdown resumes.
 *
 * Returns:
 *   KERN_NOT_SUPPORTED - feature disabled, or CPU not user-controllable
 *   KERN_FAILURE       - SMT disabled and this is a secondary SMT processor,
 *                        powerdown suspended, or CPU already requested online
 *   KERN_SUCCESS       - request accepted
 */
kern_return_t
sched_processor_start_user(processor_t processor)
{
	assert(processor);

	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	assert(preemption_enabled());

	kern_return_t result;

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (!enable_processor_exit) {
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

	if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) {
		result = KERN_NOT_SUPPORTED;
		goto unlock;
	}

#if CONFIG_SCHED_SMT
	/* Not allowed to start an SMT processor while SMT is disabled */
	if ((sched_enable_smt == 0) && (processor->processor_primary != processor)) {
		result = KERN_FAILURE;
		goto unlock;
	}
#endif /* CONFIG_SCHED_SMT */

	if (pcs.pcs_powerdown_suspend_count) {
		result = KERN_FAILURE;
		goto unlock;
	}

	if (bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) {
		/* Already requested online by userspace. */
		result = KERN_FAILURE;
		goto unlock;
	}

	result = KERN_SUCCESS;

	bit_set(pcs.pcs_requested_online_user, processor->cpu_id);

	/*
	 * Once the user puts all CPUs back online,
	 * we can resume automatic cluster power down.
	 */
	if (pcs.pcs_requested_online_user == pcs.pcs_managed_cores) {
		pcs.pcs_user_online_core_control = false;
	}

	if (sched_needs_update_requested_powered_cores()) {
		/* Note: temporarily drops (and reacquires) the available-cores lock. */
		sched_update_powered_cores_drops_lock(REASON_USER, s);
	}

unlock:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return result;
}
8241
8242 sched_cond_atomic_t sched_update_powered_cores_wakeup;
8243 thread_t sched_update_powered_cores_thread;
8244
8245
8246 static void OS_NORETURN sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused);
8247
8248 /*
8249 * After all processors have been ml_processor_register'ed and processor_boot'ed
8250 * the scheduler can finalize its datastructures and allow CPU power state changes.
8251 *
8252 * Enforce that this only happens *once*. More than once is definitely not OK. rdar://121270513
8253 */
/*
 * Finalize scheduler CPU power-state data structures once all processors
 * have registered and booted. May only be called once (panics otherwise).
 * Starts the sched_update_powered_cores worker thread, seeds all request
 * masks to "everything online", records which CPUs the platform forbids
 * from powering off, then releases the boot-time powerdown suspension.
 */
void
sched_cpu_init_completed(void)
{
	static bool sched_cpu_init_completed_called = false;

	/* Enforce at-most-once semantics with an atomic flag flip. */
	if (!os_atomic_cmpxchg(&sched_cpu_init_completed_called, false, true, relaxed)) {
		panic("sched_cpu_init_completed called twice! %d", sched_cpu_init_completed_called);
	}

	if (SCHED(cpu_init_completed) != NULL) {
		SCHED(cpu_init_completed)();
	}

	/* Wait for any cpu that is still starting, and enforce that they eventually complete. */
	check_all_cpus_are_done_starting(PROCESSOR_FIRST_BOOT);

	lck_mtx_lock(&cluster_powerdown_lock);

	assert(sched_update_powered_cores_thread == THREAD_NULL);

	sched_cond_init(&sched_update_powered_cores_wakeup);

	/* Spawn the worker thread that applies powered-cores state changes. */
	kern_return_t result = kernel_thread_start_priority(
		sched_update_powered_cores_continue,
		NULL, MAXPRI_KERNEL, &sched_update_powered_cores_thread);
	if (result != KERN_SUCCESS) {
		panic("failed to create sched_update_powered_cores thread");
	}

	thread_set_thread_name(sched_update_powered_cores_thread,
	    "sched_update_powered_cores");

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_init_completed == false);

	/* Everything online at this point becomes the managed set. */
	pcs.pcs_managed_cores = pcs.pcs_effective.pcs_online_cores;

	assert(__builtin_popcountll(pcs.pcs_managed_cores) == machine_info.logical_cpu_max);

	/* If CLPC tries to cluster power down before this point, it's ignored. */
	pcs.pcs_requested_online_user = pcs.pcs_managed_cores;
	pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores;
	pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores;

	cpumap_t system_required_cores = 0;

	/*
	 * Ask the platform layer which CPUs are allowed to
	 * be powered off outside of system sleep.
	 */
	for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) {
		if (!ml_cpu_can_exit(cpu_id)) {
			bit_set(system_required_cores, cpu_id);
		}
	}

	pcs.pcs_required_online_system = system_required_cores;
	pcs.pcs_effective.pcs_powerdown_recommended_cores = pcs.pcs_managed_cores;

	pcs.pcs_requested = sched_compute_requested_powered_cores();

	/* Requested and effective state must both start fully online. */
	assert(pcs.pcs_requested.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_requested.pcs_online_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_requested.pcs_tempdown_cores == 0);

	assert(pcs.pcs_effective.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_effective.pcs_online_cores == pcs.pcs_managed_cores);
	assert(pcs.pcs_effective.pcs_tempdown_cores == 0);

	pcs.pcs_init_completed = true;

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	lck_mtx_unlock(&cluster_powerdown_lock);

	/* Release the +1 pcs_powerdown_suspend_count that we booted up with. */
	resume_cluster_powerdown();
}
8335
8336 bool
sched_is_in_sleep(void)8337 sched_is_in_sleep(void)
8338 {
8339 return pcs.pcs_in_kernel_sleep || pcs.pcs_wants_kernel_sleep;
8340 }
8341
/* Returns whether sched_cpu_init_completed() has finished. */
bool
sched_is_cpu_init_completed(void)
{
	return pcs.pcs_init_completed;
}
8347
8348 processor_reason_t last_sched_update_powered_cores_continue_reason;
8349
/*
 * Body of the sched_update_powered_cores worker thread.
 *
 * Loops forever: under cluster_powerdown_lock and the available-cores lock,
 * applies any pending powered-cores change, then blocks on the wakeup
 * condition (via sched_cond_wait with itself as continuation) until
 * signaled again. Re-checks after each update because the update path
 * drops the locks.
 */
static void OS_NORETURN
sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused)
{
	/* Consume any wakeup that was posted before we got here. */
	sched_cond_ack(&sched_update_powered_cores_wakeup);

	while (true) {
		lck_mtx_lock(&cluster_powerdown_lock);

		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		bool needs_update = sched_needs_update_requested_powered_cores();

		if (needs_update) {
			/* This thread shouldn't need to make changes while powerdown is suspended */
			assert(pcs.pcs_powerdown_suspend_count == 0);

			/* Reason recorded by whoever signaled us. */
			processor_reason_t reason = last_sched_update_powered_cores_continue_reason;

			sched_update_powered_cores_drops_lock(reason, s);
		}

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		lck_mtx_unlock(&cluster_powerdown_lock);

		/* If we did an update, we dropped the lock, so check again. */

		if (!needs_update) {
			sched_cond_wait(&sched_update_powered_cores_wakeup, THREAD_UNINT,
			    sched_update_powered_cores_continue);
			/* The condition was signaled since we last blocked, check again. */
		}
	}
}
8386
/*
 * Option bits carried in the high bits of the sched_set_powered_cores()
 * sysctl argument (DEVELOPMENT/DEBUG path); the ASSERT_* flags make
 * sched_perfcontrol_update_powered_cores() only verify state rather than
 * change it.
 */
__options_decl(sched_powered_cores_flags_t, uint32_t, {
	ASSERT_IN_SLEEP = 0x10000000,
	ASSERT_POWERDOWN_SUSPENDED = 0x20000000,
	POWERED_CORES_OPTIONS_MASK = ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED,
});
8392
8393 /*
8394 * This is KPI with CLPC.
8395 */
/*
 * CLPC entry point to request a new set of powered cores.
 *
 * Records the request under the REASON_CLPC_SYSTEM or REASON_CLPC_USER
 * mask (restricted to managed cores) and, if the effective state now
 * differs from the requested state, signals the async update thread.
 * On DEVELOPMENT/DEBUG builds, ASSERT_* flags turn the call into a pure
 * state-verification with no request change.
 */
void
sched_perfcontrol_update_powered_cores(
	uint64_t requested_powered_cores,
	processor_reason_t reason,
	__unused uint32_t flags)
{
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(pcs.pcs_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(pcs.pcs_sleep_override_recommended == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Ignore bits for cores the scheduler doesn't manage. */
	cpumap_t requested_cores = requested_powered_cores & pcs.pcs_managed_cores;

	if (reason == REASON_CLPC_SYSTEM) {
		pcs.pcs_requested_online_clpc_system = requested_cores;
	} else if (reason == REASON_CLPC_USER) {
		pcs.pcs_requested_online_clpc_user = requested_cores;
	}

	bool needs_update = sched_needs_update_requested_powered_cores();

	if (needs_update) {
		/* Remember why, for the async worker thread's update pass. */
		last_sched_update_powered_cores_continue_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Signal outside the spinlock; the worker applies the change asynchronously. */
	if (needs_update) {
		sched_cond_signal(&sched_update_powered_cores_wakeup,
		    sched_update_powered_cores_thread);
	}
}
8441
8442 /*
8443 * This doesn't just suspend cluster powerdown.
8444 * It also powers up all the cores and leaves them up,
8445 * even if some user wanted them down.
8446 * This is important because dtrace, monotonic, and others can't handle any
8447 * powered down cores, not just cluster powerdown.
8448 */
/*
 * Suspend cluster powerdown and synchronously power all cores back up.
 *
 * Called with cluster_powerdown_lock held. Increments the suspend count;
 * when for_sleep is set, also marks the kernel-sleep transition flags
 * around the powered-cores update. After the update, verifies that no
 * CPU is still mid-start.
 */
static void
suspend_cluster_powerdown_locked(bool for_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	kprintf("%s>calling sched_update_powered_cores to suspend powerdown\n", __func__);

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_powerdown_suspend_count >= 0);

	if (for_sleep) {
		/* Entering sleep: must not already be sleeping or wanting sleep. */
		assert(!pcs.pcs_wants_kernel_sleep);
		assert(!pcs.pcs_in_kernel_sleep);
		pcs.pcs_wants_kernel_sleep = true;
	}

	pcs.pcs_powerdown_suspend_count++;

	if (sched_needs_update_requested_powered_cores()) {
		/* Note: temporarily drops (and reacquires) the available-cores lock. */
		sched_update_powered_cores_drops_lock(REASON_SYSTEM, s);
	}

	if (for_sleep) {
		assert(pcs.pcs_wants_kernel_sleep);
		assert(!pcs.pcs_in_kernel_sleep);
		pcs.pcs_in_kernel_sleep = true;

		assert(sched_needs_update_requested_powered_cores() == false);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	if (pcs.pcs_init_completed) {
		/* At this point, no cpu should be still starting. Let's enforce that. */
		check_all_cpus_are_done_starting(for_sleep ?
		    PROCESSOR_BEFORE_ENTERING_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_SUSPEND);
	}
}
8489
/*
 * Resume cluster powerdown after a matching suspend.
 *
 * Called with cluster_powerdown_lock held. Decrements the suspend count
 * (panics on underflow); when the count reaches zero, resets the user and
 * CLPC request state to fully-online and returns control of powerdown to
 * clients. When for_sleep is set, clears the kernel-sleep transition flags
 * around the powered-cores update.
 */
static void
resume_cluster_powerdown_locked(bool for_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);

	if (pcs.pcs_init_completed) {
		/* At this point, no cpu should be still starting. Let's enforce that. */
		check_all_cpus_are_done_starting(for_sleep ?
		    PROCESSOR_WAKE_FROM_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_RESUME);
	}

	kprintf("%s>calling sched_update_powered_cores to resume powerdown\n", __func__);

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (pcs.pcs_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with pcs.pcs_powerdown_suspend_count=%d\n", pcs.pcs_powerdown_suspend_count);
	}

	if (for_sleep) {
		assert(pcs.pcs_wants_kernel_sleep);
		assert(pcs.pcs_in_kernel_sleep);
		pcs.pcs_wants_kernel_sleep = false;
	}

	pcs.pcs_powerdown_suspend_count--;

	if (pcs.pcs_powerdown_suspend_count == 0) {
		/* Returning to client controlled powerdown mode */
		assert(pcs.pcs_init_completed);

		/* To match previous behavior, clear the user state */
		pcs.pcs_requested_online_user = pcs.pcs_managed_cores;
		pcs.pcs_user_online_core_control = false;

		/* To match previous behavior, clear the requested CLPC state. */
		pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores;
		pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores;
	}

	if (sched_needs_update_requested_powered_cores()) {
		/* Note: temporarily drops (and reacquires) the available-cores lock. */
		sched_update_powered_cores_drops_lock(REASON_SYSTEM, s);
	}

	if (for_sleep) {
		assert(!pcs.pcs_wants_kernel_sleep);
		assert(pcs.pcs_in_kernel_sleep);
		pcs.pcs_in_kernel_sleep = false;

		assert(sched_needs_update_requested_powered_cores() == false);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
8546
8547 static uint64_t
die_and_cluster_to_cpu_mask(__unused unsigned int die_id,__unused unsigned int die_cluster_id)8548 die_and_cluster_to_cpu_mask(
8549 __unused unsigned int die_id,
8550 __unused unsigned int die_cluster_id)
8551 {
8552 #if __arm__ || __arm64__
8553 const ml_topology_info_t* topology = ml_get_topology_info();
8554 unsigned int num_clusters = topology->num_clusters;
8555 for (unsigned int i = 0; i < num_clusters; i++) {
8556 ml_topology_cluster_t* cluster = &topology->clusters[i];
8557 if ((cluster->die_id == die_id) &&
8558 (cluster->die_cluster_id == die_cluster_id)) {
8559 return cluster->cpu_mask;
8560 }
8561 }
8562 #endif
8563 return 0ull;
8564 }
8565
8566 /*
8567 * Take an assertion that ensures all CPUs in the cluster are powered up until
8568 * the assertion is released.
8569 * A system suspend will still power down the CPUs.
8570 * This call will stall if system suspend is in progress.
8571 *
8572 * Future ER: Could this just power up the cluster, and leave enabling the
8573 * processors to be asynchronous, or deferred?
8574 *
8575 * Enabling the rail is synchronous, it must be powered up before returning.
8576 */
8577 void
sched_enable_acc_rail(unsigned int die_id,unsigned int die_cluster_id)8578 sched_enable_acc_rail(unsigned int die_id, unsigned int die_cluster_id)
8579 {
8580 uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id);
8581
8582 lck_mtx_lock(&cluster_powerdown_lock);
8583
8584 /*
8585 * Note: if pcs.pcs_init_completed is false, because the
8586 * CPUs have not booted yet, then we assume that all
8587 * clusters are already powered up at boot (see IOCPUInitialize)
8588 * so we don't have to wait for cpu boot to complete.
8589 * We'll still save the requested assertion and enforce it after
8590 * boot completes.
8591 */
8592
8593 spl_t s = splsched();
8594 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
8595
8596 if (pcs.pcs_init_completed) {
8597 assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask);
8598 }
8599
8600 /* Can't enable something that is already enabled */
8601 assert((pcs.pcs_required_online_pmgr & core_mask) == 0);
8602
8603 pcs.pcs_required_online_pmgr |= core_mask;
8604
8605 if (sched_needs_update_requested_powered_cores()) {
8606 sched_update_powered_cores_drops_lock(REASON_PMGR_SYSTEM, s);
8607 }
8608
8609 simple_unlock(&sched_available_cores_lock);
8610 splx(s);
8611
8612 lck_mtx_unlock(&cluster_powerdown_lock);
8613 }
8614
8615 /*
8616 * Release the assertion ensuring the cluster is powered up.
8617 * This operation is asynchronous, so PMGR doesn't need to wait until it takes
8618 * effect. If the enable comes in before it takes effect, it'll either
8619 * wait on the lock, or the async thread will discover it needs no update.
8620 */
8621 void
sched_disable_acc_rail(unsigned int die_id,unsigned int die_cluster_id)8622 sched_disable_acc_rail(unsigned int die_id, unsigned int die_cluster_id)
8623 {
8624 uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id);
8625
8626 spl_t s = splsched();
8627 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
8628
8629 /* Can't disable something that is already disabled */
8630 assert((pcs.pcs_required_online_pmgr & core_mask) == core_mask);
8631
8632 if (pcs.pcs_init_completed) {
8633 assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask);
8634 }
8635
8636 pcs.pcs_required_online_pmgr &= ~core_mask;
8637
8638 bool needs_update = sched_needs_update_requested_powered_cores();
8639
8640 if (needs_update) {
8641 last_sched_update_powered_cores_continue_reason = REASON_PMGR_SYSTEM;
8642 }
8643
8644 simple_unlock(&sched_available_cores_lock);
8645 splx(s);
8646
8647 if (needs_update) {
8648 sched_cond_signal(&sched_update_powered_cores_wakeup,
8649 sched_update_powered_cores_thread);
8650 }
8651 }
8652
/* Public wrapper: suspend cluster powerdown (not for sleep) under the lock. */
void
suspend_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);
	suspend_cluster_powerdown_locked(false);
	lck_mtx_unlock(&cluster_powerdown_lock);
}
8660
/*
 * Public wrapper: resume cluster powerdown (not for sleep) under the lock.
 * Afterwards, if SMT is disabled by policy, re-disable the SMT processors
 * that the resume may have brought up.
 */
void
resume_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);
	resume_cluster_powerdown_locked(false);
	lck_mtx_unlock(&cluster_powerdown_lock);

#if CONFIG_SCHED_SMT
	if (sched_enable_smt == 0) {
		enable_smt_processors(false);
	}
#endif /* CONFIG_SCHED_SMT */
}
8674
8675
8676 LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
8677 static bool user_suspended_cluster_powerdown = false;
8678
8679 kern_return_t
suspend_cluster_powerdown_from_user(void)8680 suspend_cluster_powerdown_from_user(void)
8681 {
8682 kern_return_t ret = KERN_FAILURE;
8683
8684 lck_mtx_lock(&user_cluster_powerdown_lock);
8685
8686 if (!user_suspended_cluster_powerdown) {
8687 suspend_cluster_powerdown();
8688 user_suspended_cluster_powerdown = true;
8689 ret = KERN_SUCCESS;
8690 }
8691
8692 lck_mtx_unlock(&user_cluster_powerdown_lock);
8693
8694 return ret;
8695 }
8696
8697 kern_return_t
resume_cluster_powerdown_from_user(void)8698 resume_cluster_powerdown_from_user(void)
8699 {
8700 kern_return_t ret = KERN_FAILURE;
8701
8702 lck_mtx_lock(&user_cluster_powerdown_lock);
8703
8704 if (user_suspended_cluster_powerdown) {
8705 resume_cluster_powerdown();
8706 user_suspended_cluster_powerdown = false;
8707 ret = KERN_SUCCESS;
8708 }
8709
8710 lck_mtx_unlock(&user_cluster_powerdown_lock);
8711
8712 return ret;
8713 }
8714
8715 int
get_cluster_powerdown_user_suspended(void)8716 get_cluster_powerdown_user_suspended(void)
8717 {
8718 lck_mtx_lock(&user_cluster_powerdown_lock);
8719
8720 int ret = (int)user_suspended_cluster_powerdown;
8721
8722 lck_mtx_unlock(&user_cluster_powerdown_lock);
8723
8724 return ret;
8725 }
8726
8727 #if DEVELOPMENT || DEBUG
/* Functions to support the temporary sysctl */
/* Last raw request word passed in, reported back by sched_get_powered_cores(). */
static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;

/*
 * Sysctl entry point: decode a packed request word and forward it to
 * sched_perfcontrol_update_powered_cores().
 *
 * Encoding of requested_powered_cores:
 *   bit 31      - set: attribute to REASON_CLPC_USER, clear: REASON_CLPC_SYSTEM
 *   option bits - flags selected by POWERED_CORES_OPTIONS_MASK
 *   bits 28:0   - the requested powered-core bitmask
 */
void
sched_set_powered_cores(int requested_powered_cores)
{
	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
	sched_powered_cores_flags_t flags = requested_powered_cores & POWERED_CORES_OPTIONS_MASK;

	/* Remember the raw request so the companion getter can report it. */
	saved_requested_powered_cores = requested_powered_cores;

	/* Strip the reason/option bits, leaving just the core mask. */
	requested_powered_cores = bits(requested_powered_cores, 28, 0);

	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
}
/* Report the last raw value passed to sched_set_powered_cores(). */
int
sched_get_powered_cores(void)
{
	return (int)saved_requested_powered_cores;
}
8747
/* Expose the currently published recommended-cores bitmap for the sysctl. */
uint64_t
sched_sysctl_get_recommended_cores(void)
{
	return pcs.pcs_recommended_cores;
}
8753 #endif
8754
8755 /*
8756 * Ensure that all cores are powered and recommended before sleep
8757 * Acquires cluster_powerdown_lock and returns with it held.
8758 */
void
sched_override_available_cores_for_sleep(void)
{
	if (!pcs.pcs_init_completed) {
		panic("Attempting to sleep before all CPUS are registered");
	}

	/*
	 * Held across sleep; the matching unlock happens in
	 * sched_restore_available_cores_after_sleep().
	 */
	lck_mtx_lock(&cluster_powerdown_lock);

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(pcs.pcs_sleep_override_recommended == false);

	/* Force the recommendation override on and republish recommended cores. */
	pcs.pcs_sleep_override_recommended = true;
	sched_update_recommended_cores_locked(REASON_SYSTEM, 0);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* NOTE(review): 'true' appears to tag this as the sleep path — confirm. */
	suspend_cluster_powerdown_locked(true);
}
8781
8782 /*
8783 * Restore the previously recommended cores, but leave all cores powered
8784 * after sleep.
8785 * Called with cluster_powerdown_lock still held, releases the lock.
8786 */
void
sched_restore_available_cores_after_sleep(void)
{
	/* Caller entered via sched_override_available_cores_for_sleep(), which kept the lock. */
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	assert(pcs.pcs_sleep_override_recommended == true);

	/* Clear the sleep override and republish the normal recommendations. */
	pcs.pcs_sleep_override_recommended = false;
	sched_update_recommended_cores_locked(REASON_NONE, 0);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	resume_cluster_powerdown_locked(true);

	lck_mtx_unlock(&cluster_powerdown_lock);

	/* Re-apply the boot-arg-driven SMT disable, as in resume_cluster_powerdown(). */
#if CONFIG_SCHED_SMT
	if (sched_enable_smt == 0) {
		enable_smt_processors(false);
	}
#endif /* CONFIG_SCHED_SMT */
}
8812
8813 #if __arm__ || __arm64__
8814
/* When the maintenance thread last became runnable, captured at failsafe trigger. */
uint64_t perfcontrol_failsafe_maintenance_runnable_time;
/* mach_absolute_time() when the failsafe engaged / disengaged. */
uint64_t perfcontrol_failsafe_activation_time;
uint64_t perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int perfcontrol_failsafe_pid;
uint64_t perfcontrol_failsafe_tid;
/* Blamed thread's on-core timer value at failsafe start / last observation. */
uint64_t perfcontrol_failsafe_thread_timer_at_start;
uint64_t perfcontrol_failsafe_thread_timer_last_seen;
/* The recommendation bitmap in force when the failsafe triggered. */
uint64_t perfcontrol_failsafe_recommended_at_trigger;
8827
8828 /*
8829 * Perf controller calls here to update the recommended core bitmask.
8830 * If the failsafe is active, we don't immediately apply the new value.
8831 * Instead, we store the new request and use it after the failsafe deactivates.
8832 *
8833 * If the failsafe is not active, immediately apply the update.
8834 *
8835 * No scheduler locks are held, no other locks are held that scheduler might depend on,
8836 * interrupts are enabled
8837 *
8838 * currently prototype is in osfmk/arm/machine_routines.h
8839 */
void
sched_perfcontrol_update_recommended_cores_reason(
	uint64_t recommended_cores,
	processor_reason_t reason,
	__unused uint32_t flags)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the system and user requests separately; only one changes per call. */
	if (reason == REASON_CLPC_SYSTEM) {
		pcs.pcs_requested_recommended_clpc_system = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		pcs.pcs_requested_recommended_clpc_user = recommended_cores;
	}

	/* The effective CLPC request is the intersection of both requests. */
	pcs.pcs_requested_recommended_clpc = pcs.pcs_requested_recommended_clpc_system &
	    pcs.pcs_requested_recommended_clpc_user;

	/* Mirror the combined request for sysctl observability. */
	sysctl_sched_recommended_cores = pcs.pcs_requested_recommended_clpc;

	/* Apply (or defer, if an override/failsafe is active) the new recommendation. */
	sched_update_recommended_cores_locked(reason, 0);

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
8868
/* Legacy 32-bit entry point; attributes the update to REASON_CLPC_USER. */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
8874
8875 /*
8876 * Consider whether we need to activate the recommended cores failsafe
8877 *
8878 * Called from quantum timer interrupt context of a realtime thread
8879 * No scheduler locks are held, interrupts are disabled
8880 */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(pcs.pcs_recommended_clpc_failsafe_active)) {
		/* keep track of how long the responsible thread runs */
		uint64_t cur_th_time = recount_current_thread_time_mach();

		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		/* Re-check under the lock; only update if this is still the blamed thread. */
		if (pcs.pcs_recommended_clpc_failsafe_active &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
		}

		simple_unlock(&sched_available_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(bit_count(pcs.pcs_requested_recommended_clpc) >= processor_count)) {
		return;
	}

	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation. We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Someone else may have activated the failsafe while we were unlocked. */
	if (pcs.pcs_recommended_clpc_failsafe_active) {
		simple_unlock(&sched_available_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    pcs.pcs_requested_recommended_clpc, maintenance_runnable_time, 0, 0, 0);

	pcs.pcs_recommended_clpc_failsafe_active = true;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = pcs.pcs_requested_recommended_clpc;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = recount_current_thread_time_mach();

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Publish the pcs_recommended_clpc_failsafe_active override to the CPUs */
	sched_update_recommended_cores_locked(REASON_SYSTEM, 0);

	simple_unlock(&sched_available_cores_lock);
}
8991
8992 /*
8993 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
8994 *
8995 * Runs in the context of the maintenance thread, no locks held
8996 */
static void
sched_recommended_cores_maintenance(void)
{
	/* Common case - no failsafe, nothing to be done here */
	if (__probable(!pcs.pcs_recommended_clpc_failsafe_active)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (!pcs.pcs_recommended_clpc_failsafe_active) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */
	/*
	 * NOTE: the 'goto out' paths above skip these locals; they are only read
	 * below when print_diagnostic is TRUE, which is set after they are assigned.
	 */

	int pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
	uint64_t rec_cores_after = pcs.pcs_requested_recommended_clpc;
	uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	pcs.pcs_recommended_clpc_failsafe_active = false;

	sched_update_recommended_cores_locked(REASON_SYSTEM, 0);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    pcs.pcs_requested_recommended_clpc, failsafe_duration, 0, 0, 0);

out:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Printing happens outside the lock; values were snapshotted above. */
	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%llx -> 0x%llx\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
9073
#endif /* __arm__ || __arm64__ */
9075
9076 /*
9077 * This is true before we have jumped to kernel_bootstrap_thread
9078 * first thread context during boot, or while all processors
9079 * have offlined during system sleep and the scheduler is disabled.
9080 *
9081 * (Note: only ever true on ARM, Intel doesn't actually offline the last CPU)
9082 */
/* True when no core is marked online in the effective powered-cores state. */
bool
sched_all_cpus_offline(void)
{
	return pcs.pcs_effective.pcs_online_cores == 0;
}
9088
/* Assert that cpu_id is not the sole remaining online CPU. */
void
sched_assert_not_last_online_cpu(__assert_only int cpu_id)
{
	assertf(pcs.pcs_effective.pcs_online_cores != BIT(cpu_id),
	    "attempting to shut down the last online CPU!");
}
9095
9096 /*
9097 * This is the unified single function to change published active core counts based on processor mode.
9098 * Each type of flag affects the other in terms of how the counts change.
9099 *
9100 * Future: Add support for not decrementing counts in 'temporary derecommended online' mode
9101 * Future: Shutdown for system sleep should be 'temporary' according to the user counts
9102 * so that no client sees a transiently low number of CPUs.
9103 */
void
sched_processor_change_mode_locked(processor_t processor, processor_mode_t pcm_mode, bool set)
{
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);
	pset_assert_locked(processor->processor_set);

	switch (pcm_mode) {
	case PCM_RECOMMENDED:
		/*
		 * Recommendation changes only move the "user" availability counts,
		 * and only when the processor is currently online.
		 */
		if (set) {
			assert(!processor->is_recommended);
			assert(!bit_test(pcs.pcs_recommended_cores, processor->cpu_id));

			processor->is_recommended = true;
			bit_set(pcs.pcs_recommended_cores, processor->cpu_id);

			if (processor->processor_online) {
				os_atomic_inc(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
				if (processor->processor_primary == processor) {
					os_atomic_inc(&primary_processor_avail_count_user, relaxed);
				}
#endif /* CONFIG_SCHED_SMT */
			}
		} else {
			assert(processor->is_recommended);
			assert(bit_test(pcs.pcs_recommended_cores, processor->cpu_id));

			processor->is_recommended = false;
			bit_clear(pcs.pcs_recommended_cores, processor->cpu_id);

			if (processor->processor_online) {
				os_atomic_dec(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
				if (processor->processor_primary == processor) {
					os_atomic_dec(&primary_processor_avail_count_user, relaxed);
				}
#endif /* CONFIG_SCHED_SMT */
			}
		}
		break;
	case PCM_TEMPORARY:
		/*
		 * Temporary-shutdown changes adjust the published counts in the
		 * opposite direction of PCM_ONLINE: marking an offline core
		 * "temporary" counts it back up; clearing the flag while still
		 * offline counts it down.
		 */
		if (set) {
			assert(!processor->shutdown_temporary);
			assert(!bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id));

			processor->shutdown_temporary = true;
			bit_set(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id);

			if (!processor->processor_online) {
				goto counts_up;
			}
		} else {
			assert(processor->shutdown_temporary);
			assert(bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id));

			processor->shutdown_temporary = false;
			bit_clear(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id);

			if (!processor->processor_online) {
				goto counts_down;
			}
		}
		break;
	case PCM_ONLINE:
		/*
		 * Online changes update counts unless the core is flagged
		 * temporary, in which case PCM_TEMPORARY already accounted it.
		 */
		if (set) {
			assert(!processor->processor_online);
			assert(!bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
			processor->processor_online = true;
			bit_set(pcs.pcs_effective.pcs_online_cores, processor->cpu_id);

			if (!processor->shutdown_temporary) {
				goto counts_up;
			}
		} else {
			assert(processor->processor_online);
			assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
			processor->processor_online = false;
			bit_clear(pcs.pcs_effective.pcs_online_cores, processor->cpu_id);

			if (!processor->shutdown_temporary) {
				goto counts_down;
			}
		}
		break;
	default:
		panic("unknown mode %d", pcm_mode);
	}

	return;

counts_up:
	/* Shared tail: publish the core as available and refresh the commpage. */
	ml_cpu_up_update_counts(processor->cpu_id);

	os_atomic_inc(&processor_avail_count, relaxed);

	if (processor->is_recommended) {
		os_atomic_inc(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
		if (processor->processor_primary == processor) {
			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	}
	commpage_update_active_cpus();

	return;

counts_down:
	/* Shared tail: publish the core as unavailable and refresh the commpage. */
	ml_cpu_down_update_counts(processor->cpu_id);

	os_atomic_dec(&processor_avail_count, relaxed);

	if (processor->is_recommended) {
		os_atomic_dec(&processor_avail_count_user, relaxed);
#if CONFIG_SCHED_SMT
		if (processor->processor_primary == processor) {
			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
		}
#endif /* CONFIG_SCHED_SMT */
	}
	commpage_update_active_cpus();

	return;
}
9228
/*
 * Mark the calling processor online in scheduler state.
 * Returns true if this is the first processor to come online
 * (boot, or wake from full system sleep).
 */
bool
sched_mark_processor_online(processor_t processor, __assert_only processor_reason_t reason)
{
	assert(processor == current_processor());

	processor_set_t pset = processor->processor_set;

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	/* Boot CPU coming online for the first time, either at boot or after sleep */
	bool is_first_online_processor = sched_all_cpus_offline();
	if (is_first_online_processor) {
		assert(processor == master_processor);
	}

	/* Only REASON_SYSTEM may bring the boot CPU up, unless it supports shutdown. */
	assert((processor != master_processor) || (reason == REASON_SYSTEM) || support_bootcpu_shutdown);

	sched_processor_change_mode_locked(processor, PCM_ONLINE, true);

	assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTING ||
	    processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_RUNNING ||
	    processor->processor_offline_state == PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP);

	processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_STARTED_NOT_WAITED);

	++pset->online_processor_count;
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);

	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(processor, pset, false); /* May relock the pset lock */
	}
	pset_unlock(pset);

	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return is_first_online_processor;
}
9271
9272 void
sched_mark_processor_offline(processor_t processor,bool is_final_system_sleep)9273 sched_mark_processor_offline(processor_t processor, bool is_final_system_sleep)
9274 {
9275 assert(processor == current_processor());
9276
9277 processor_set_t pset = processor->processor_set;
9278
9279 spl_t s = splsched();
9280 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
9281
9282 assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id));
9283 assert(processor->processor_offline_state == PROCESSOR_OFFLINE_BEGIN_SHUTDOWN);
9284
9285 if (!is_final_system_sleep) {
9286 /*
9287 * We can't shut down the last available core!
9288 * Force recommend another CPU if this is the last one.
9289 */
9290
9291 if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) {
9292 sched_update_recommended_cores_locked(REASON_SYSTEM, BIT(processor->cpu_id));
9293 }
9294
9295 /* If we're still the last one, something went wrong. */
9296 if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) {
9297 panic("shutting down the last available core! online: 0x%llx rec: 0x%llxx",
9298 pcs.pcs_effective.pcs_online_cores,
9299 pcs.pcs_recommended_cores);
9300 }
9301 }
9302
9303 pset_lock(pset);
9304 assert(processor->state == PROCESSOR_RUNNING);
9305 assert(processor->processor_inshutdown);
9306 pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
9307 --pset->online_processor_count;
9308
9309 sched_processor_change_mode_locked(processor, PCM_ONLINE, false);
9310
9311 if (is_final_system_sleep) {
9312 assert3u(pcs.pcs_effective.pcs_online_cores, ==, 0);
9313 assert(processor == master_processor);
9314 assert(sched_all_cpus_offline());
9315
9316 processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP);
9317 } else {
9318 processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_PENDING_OFFLINE);
9319 }
9320
9321 simple_unlock(&sched_available_cores_lock);
9322
9323 SCHED(processor_queue_shutdown)(processor);
9324 /* pset lock dropped */
9325 SCHED(rt_queue_shutdown)(processor);
9326
9327 splx(s);
9328 }
9329
9330 /*
9331 * Apply a new recommended cores mask to the processors it affects
9332 * Runs after considering failsafes and such
9333 *
9334 * Iterate over processors and update their ->is_recommended field.
9335 * If a processor is running, we let it drain out at its next
9336 * quantum expiration or blocking point. If a processor is idle, there
9337 * may be more work for it to do, so IPI it.
9338 *
9339 * interrupts disabled, sched_available_cores_lock is held
9340 *
9341 * If a core is about to go offline, its bit will be set in core_going_offline,
9342 * so we can make sure not to pick it as the last resort cpu.
9343 */
static void
sched_update_recommended_cores_locked(processor_reason_t reason,
    cpumap_t core_going_offline)
{
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);

	/* Start from CLPC's combined (system & user) recommendation request. */
	cpumap_t recommended_cores = pcs.pcs_requested_recommended_clpc;

	if (pcs.pcs_init_completed) {
		recommended_cores &= pcs.pcs_effective.pcs_powerdown_recommended_cores;
	}

	if (pcs.pcs_sleep_override_recommended || pcs.pcs_recommended_clpc_failsafe_active) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);

		/* Sleep entry or the starvation failsafe overrides CLPC: recommend all managed cores. */
		recommended_cores = pcs.pcs_managed_cores;
	}

	if (bit_count(recommended_cores & pcs.pcs_effective.pcs_online_cores & ~core_going_offline) == 0) {
		/*
		 * If there are no online cpus recommended,
		 * then the system will make no forward progress.
		 * Pick a CPU of last resort to avoid hanging.
		 */
		int last_resort;

		if (!support_bootcpu_shutdown) {
			/* We know the master_processor is always available */
			last_resort = master_processor->cpu_id;
		} else {
			/* Pick some still-online processor to be the processor of last resort */
			last_resort = lsb_first(pcs.pcs_effective.pcs_online_cores & ~core_going_offline);

			if (last_resort == -1) {
				panic("%s> no last resort cpu found: 0x%llx 0x%llx",
				    __func__, pcs.pcs_effective.pcs_online_cores, core_going_offline);
			}
		}

		bit_set(recommended_cores, last_resort);
	}

	if (pcs.pcs_recommended_cores == recommended_cores) {
		/* Nothing to do */
		return;
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) |
	    DBG_FUNC_START,
	    recommended_cores,
	    pcs.pcs_recommended_clpc_failsafe_active, pcs.pcs_sleep_override_recommended, 0);

	cpumap_t needs_exit_idle_mask = 0x0;

	/* First set recommended cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			cpumap_foreach(cpu_id, newly_recommended) {
				processor_t processor = processor_array[cpu_id];

				sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, true);

				processor->last_recommend_reason = reason;

				if (pset->recommended_bitmask == 0) {
					/* Cluster is becoming available for scheduling */
					atomic_bit_set(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed);
				}
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					if (processor != current_processor()) {
						/* Defer the wakeup IPI until all pset locks are dropped. */
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}

				if (processor->processor_online) {
					SCHED(pset_made_schedulable)(processor, pset, false); /* May relock the pset lock */
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);

			cpumap_foreach(cpu_id, newly_recommended) {
				smr_cpu_up(processor_array[cpu_id],
				    SMR_CPU_REASON_IGNORED);
			}
		}
	}

	/* Now shutdown not recommended cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			cpumap_foreach(cpu_id, newly_unrecommended) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				pset_lock(pset);

				sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, false);

				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				pset_update_rt_stealable_state(pset);
				if (pset->recommended_bitmask == 0) {
					/* Cluster is becoming unavailable for scheduling */
					atomic_bit_clear(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed);
				}

				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					/* A thread may be running there; preempt it off the derecommended core. */
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type == SCHED_IPI_NONE) {
					/*
					 * If the core is idle,
					 * we can directly mark the processor
					 * as "Ignored"
					 *
					 * Otherwise, smr will detect this
					 * during smr_cpu_leave() when the
					 * processor actually idles.
					 */
					smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
				} else if (processor == current_processor()) {
					ast_on(AST_PREEMPT);
				} else {
					sched_ipi_perform(processor, ipi_type);
				}
			}
		}
	}

	if (pcs.pcs_init_completed) {
		assert3u(pcs.pcs_recommended_cores, ==, recommended_cores);
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	cpumap_foreach(cpu_id, needs_exit_idle_mask) {
		processor_t processor = processor_array[cpu_id];
		machine_signal_idle(processor);
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
9525
9526 /*
9527 * Enters with the available cores lock held, returns with it held, but will drop it in the meantime.
9528 * Enters with the cluster_powerdown_lock held, returns with it held, keeps it held.
9529 */
static void
sched_update_powered_cores_drops_lock(processor_reason_t requested_reason, spl_t caller_s)
{
	/*
	 * Drive the system from the current effective powered-cores state to the
	 * freshly computed requested state, in phases:
	 *   1. publish new recommendations (derecommending soon-to-start cores early)
	 *   2. power up newly online cores, then wait for all of them to start
	 *   3. retag temporary / nontemporary cores
	 *   4. power down newly offline cores
	 *   5. apply the remaining derecommendations
	 * The sched_available_cores_lock (and splsched) is dropped around the
	 * blocking power transitions and re-taken before returning.
	 */
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);
	assert(caller_s == true); /* Caller must have had interrupts enabled when they took the lock */

	/* All transitions should be quiesced before we start changing things */
	assert_no_processors_in_transition_locked();

	/* Record why this update is happening while it is in flight */
	pcs.pcs_in_flight_reason = requested_reason;

	struct powered_cores_state requested = sched_compute_requested_powered_cores();
	struct powered_cores_state effective = pcs.pcs_effective;

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested.pcs_online_cores, requested_reason, 0, effective.pcs_online_cores);

	/* The bits that are different and in the new value */
	cpumap_t newly_online_cores = (requested.pcs_online_cores ^
	    effective.pcs_online_cores) & requested.pcs_online_cores;

	/* The bits that are different and are not in the new value */
	cpumap_t newly_offline_cores = (requested.pcs_online_cores ^
	    effective.pcs_online_cores) & ~requested.pcs_online_cores;

	/* Same XOR-and-mask delta pattern for recommendation and tempdown state */
	cpumap_t newly_recommended_cores = (requested.pcs_powerdown_recommended_cores ^
	    effective.pcs_powerdown_recommended_cores) & requested.pcs_powerdown_recommended_cores;

	cpumap_t newly_derecommended_cores = (requested.pcs_powerdown_recommended_cores ^
	    effective.pcs_powerdown_recommended_cores) & ~requested.pcs_powerdown_recommended_cores;

	cpumap_t newly_temporary_cores = (requested.pcs_tempdown_cores ^
	    effective.pcs_tempdown_cores) & requested.pcs_tempdown_cores;

	cpumap_t newly_nontemporary_cores = (requested.pcs_tempdown_cores ^
	    effective.pcs_tempdown_cores) & ~requested.pcs_tempdown_cores;

	/*
	 * Newly online and derecommended cores should be derecommended
	 * before powering them up, so they never run around doing stuff
	 * before we reach the end of this function.
	 */

	cpumap_t newly_online_and_derecommended = newly_online_cores & newly_derecommended_cores;

	/*
	 * Publish the goal state we're working on achieving.
	 * At the end of this function, pcs_effective will match this.
	 */
	pcs.pcs_requested = requested;

	pcs.pcs_effective.pcs_powerdown_recommended_cores |= newly_recommended_cores;
	pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_online_and_derecommended;

	sched_update_recommended_cores_locked(requested_reason, 0);

	/* Drop the lock and restore the caller's interrupt state; start/exit below can block */
	simple_unlock(&sched_available_cores_lock);
	splx(caller_s);

	assert(ml_get_interrupts_enabled() == true);

	/* First set powered cores */
	cpumap_t started_cores = 0ull;
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_newly_online = newly_online_cores & pset->cpu_bitmask;

			/* Sanity: none of the cores we are about to start may already be up */
			__assert_only cpumap_t pset_online_cores =
			    pset->cpu_state_map[PROCESSOR_START] |
			    pset->cpu_state_map[PROCESSOR_IDLE] |
			    pset->cpu_state_map[PROCESSOR_DISPATCHING] |
			    pset->cpu_state_map[PROCESSOR_RUNNING];
			assert((pset_online_cores & pset_newly_online) == 0);

			pset_unlock(pset);
			splx(s);

			if (pset_newly_online == 0) {
				/* Nothing to do */
				continue;
			}
			cpumap_foreach(cpu_id, pset_newly_online) {
				processor_start_reason(processor_array[cpu_id], requested_reason);
				bit_set(started_cores, cpu_id);
			}
		}
	}

	/*
	 * Wait for processors to finish starting in parallel.
	 * We never proceed until all newly started processors have finished.
	 *
	 * This has the side effect of closing the ml_cpu_up_processors race,
	 * as all started CPUs must have SIGPdisabled cleared by the time this
	 * is satisfied. (rdar://124631843)
	 */
	cpumap_foreach(cpu_id, started_cores) {
		processor_wait_for_start(processor_array[cpu_id], PROCESSOR_POWERED_CORES_CHANGE);
	}

	/*
	 * Update published counts of processors to match new temporary status
	 * Publish all temporary before nontemporary, so that any readers that
	 * see a middle state will see a slightly too high count instead of
	 * ending up seeing a 0 (because that crashes dispatch_apply, ask
	 * me how I know)
	 */

	spl_t s;
	s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			pset_lock(pset);

			cpumap_t pset_newly_temporary = newly_temporary_cores & pset->cpu_bitmask;

			cpumap_foreach(cpu_id, pset_newly_temporary) {
				sched_processor_change_mode_locked(processor_array[cpu_id],
				    PCM_TEMPORARY, true);
			}

			pset_unlock(pset);
		}
	}

	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			pset_lock(pset);

			cpumap_t pset_newly_nontemporary = newly_nontemporary_cores & pset->cpu_bitmask;

			cpumap_foreach(cpu_id, pset_newly_nontemporary) {
				sched_processor_change_mode_locked(processor_array[cpu_id],
				    PCM_TEMPORARY, false);
			}

			pset_unlock(pset);
		}
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Now shutdown not powered cores */
	foreach_node(node) {
		foreach_pset_id(pset_id, node) {
			processor_set_t pset = pset_array[pset_id];

			s = splsched();
			pset_lock(pset);

			cpumap_t pset_newly_offline = newly_offline_cores & pset->cpu_bitmask;
			/* Sanity: every core we are about to stop must currently be powered */
			__assert_only cpumap_t pset_powered_cores =
			    pset->cpu_state_map[PROCESSOR_START] |
			    pset->cpu_state_map[PROCESSOR_IDLE] |
			    pset->cpu_state_map[PROCESSOR_DISPATCHING] |
			    pset->cpu_state_map[PROCESSOR_RUNNING];
			assert((pset_powered_cores & pset_newly_offline) == pset_newly_offline);

			pset_unlock(pset);
			splx(s);

			if (pset_newly_offline == 0) {
				/* Nothing to do */
				continue;
			}

			cpumap_foreach(cpu_id, pset_newly_offline) {
				processor_exit_reason(processor_array[cpu_id], requested_reason, false);
			}
		}
	}

	assert(ml_get_interrupts_enabled() == true);

	/* Re-take the lock; caller expects it held (at splsched) on return */
	s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	assert(s == caller_s);

	/* Apply the derecommendations deferred until after powerdown */
	pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_derecommended_cores;

	sched_update_recommended_cores_locked(requested_reason, 0);

	pcs.pcs_previous_reason = requested_reason;

	/* All transitions should be quiesced now that we are done changing things */
	assert_no_processors_in_transition_locked();

	/* Effective state must now match the requested goal state published above */
	assert3u(pcs.pcs_requested.pcs_online_cores, ==, pcs.pcs_effective.pcs_online_cores);
	assert3u(pcs.pcs_requested.pcs_tempdown_cores, ==, pcs.pcs_effective.pcs_tempdown_cores);
	assert3u(pcs.pcs_requested.pcs_powerdown_recommended_cores, ==, pcs.pcs_effective.pcs_powerdown_recommended_cores);

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
}
9738
9739 void
thread_set_options(uint32_t thopt)9740 thread_set_options(uint32_t thopt)
9741 {
9742 spl_t x;
9743 thread_t t = current_thread();
9744
9745 x = splsched();
9746 thread_lock(t);
9747
9748 t->options |= thopt;
9749
9750 thread_unlock(t);
9751 splx(x);
9752 }
9753
/*
 * Record a hint describing why the thread is about to block.
 * Plain unlocked store; NOTE(review): presumably safe because callers
 * operate on the current thread or one they own -- confirm with callers.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
9759
/*
 * Return the maximum useful parallelism for the given QoS class,
 * dispatching to the active scheduler policy's implementation.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
9765
9766 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)9767 sched_qos_max_parallelism(__unused int qos, uint64_t options)
9768 {
9769 host_basic_info_data_t hinfo;
9770 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
9771
9772
9773 /*
9774 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
9775 * implement their own qos_max_parallelism() interfaces.
9776 */
9777 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
9778
9779 /* Query the machine layer for core information */
9780 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
9781 (host_info_t)&hinfo, &count);
9782 assert(kret == KERN_SUCCESS);
9783
9784 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
9785 return hinfo.logical_cpu;
9786 } else {
9787 return hinfo.physical_cpu;
9788 }
9789 }
9790
/* Master switch for honoring the NO_SMT thread/task policy (consulted by thread_no_smt()) */
int sched_allow_NO_SMT_threads = 1;
9792 #if CONFIG_SCHED_SMT
9793 bool
thread_no_smt(thread_t thread)9794 thread_no_smt(thread_t thread)
9795 {
9796 return sched_allow_NO_SMT_threads &&
9797 (thread->bound_processor == PROCESSOR_NULL) &&
9798 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
9799 }
9800
9801 bool
processor_active_thread_no_smt(processor_t processor)9802 processor_active_thread_no_smt(processor_t processor)
9803 {
9804 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
9805 }
9806 #endif /* CONFIG_SCHED_SMT */
9807
9808 #if __arm64__
9809
9810 /*
9811 * Set up or replace old timer with new timer
9812 *
9813 * Returns true if canceled old timer, false if it did not
9814 */
9815 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)9816 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
9817 {
9818 /*
9819 * Exchange deadline for new deadline, if old deadline was nonzero,
9820 * then I cancelled the callback, otherwise I didn't
9821 */
9822
9823 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
9824 relaxed) != 0;
9825 }
9826
9827 /*
9828 * Set global SFI window (in usec)
9829 */
9830 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)9831 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
9832 {
9833 kern_return_t ret = KERN_NOT_SUPPORTED;
9834 #if CONFIG_THREAD_GROUPS
9835 if (window_usecs == 0ULL) {
9836 ret = sfi_window_cancel();
9837 } else {
9838 ret = sfi_set_window(window_usecs);
9839 }
9840 #endif // CONFIG_THREAD_GROUPS
9841 return ret;
9842 }
9843
9844 /*
9845 * Set background and maintenance SFI class offtimes
9846 */
9847 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)9848 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
9849 {
9850 kern_return_t ret = KERN_NOT_SUPPORTED;
9851 #if CONFIG_THREAD_GROUPS
9852 if (offtime_usecs == 0ULL) {
9853 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
9854 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
9855 } else {
9856 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
9857 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
9858 }
9859 #endif // CONFIG_THREAD_GROUPS
9860 return ret;
9861 }
9862
9863 /*
9864 * Set utility SFI class offtime
9865 */
9866 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)9867 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
9868 {
9869 kern_return_t ret = KERN_NOT_SUPPORTED;
9870 #if CONFIG_THREAD_GROUPS
9871 if (offtime_usecs == 0ULL) {
9872 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
9873 } else {
9874 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
9875 }
9876 #endif // CONFIG_THREAD_GROUPS
9877 return ret;
9878 }
9879
9880 #endif /* __arm64__ */
9881
9882 #if CONFIG_SCHED_EDGE
9883
9884 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
9885
9886 /*
9887 * sched_edge_pset_running_higher_bucket()
9888 *
9889 * Routine to calculate cumulative running counts for each scheduling
9890 * bucket. This effectively lets the load calculation calculate if a
9891 * cluster is running any threads at a QoS lower than the thread being
9892 * migrated etc.
9893 */
9894 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)9895 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
9896 {
9897 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
9898 bzero(running_higher, sizeof(uint32_t) * TH_BUCKET_SCHED_MAX);
9899
9900 /* Count the running threads per bucket */
9901 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
9902 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
9903 /* Don't count idle threads */
9904 if (cpu_bucket < TH_BUCKET_SCHED_MAX) {
9905 running_higher[cpu_bucket]++;
9906 }
9907 }
9908
9909 /* Calculate the cumulative running counts as a prefix sum */
9910 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX - 1; bucket++) {
9911 running_higher[bucket + 1] += running_higher[bucket];
9912 }
9913 }
9914
9915 /*
9916 * sched_update_pset_load_average()
9917 *
9918 * Updates the load average for each sched bucket for a cluster.
9919 * This routine must be called with the pset lock held.
9920 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* curtime predates the published last update (stale timestamp); skip */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	if (__improbable(delta_nsecs > UINT32_MAX)) {
		/* Clamp so the 64-bit EWMA products below cannot overflow */
		delta_nsecs = UINT32_MAX;
	}

	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		/* Load = runnable shared-rsrc threads + CPUs currently running one */
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		if (old_shared_load != new_shared_load) {
			/* Trace only on change to limit trace volume */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}

	uint32_t running_higher[TH_BUCKET_SCHED_MAX];
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		/* Old average is 24.8 fixed point; pre-scale it by the EWMA time constant */
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		/* Depth = runnable in clutch hierarchy + realtime runnable + running at this bucket or higher */
		uint32_t current_runq_depth = sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket];
		os_atomic_store(&pset->pset_runnable_depth[sched_bucket], current_runq_depth, relaxed);

		uint32_t current_load = current_runq_depth / avail_cpu_count;
		/*
		 * For the new load average multiply current_load by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_load * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_load != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_load == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Zero crossing in either direction: adopt the new sample immediately */
			load_average = (current_load << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		if (load_average != old_load_average) {
			/* Trace integer and fractional parts separately; only on change */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
10003
10004 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)10005 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
10006 {
10007 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
10008 uint64_t avg_thread_execution_time = 0;
10009
10010 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
10011 old_execution_time_packed.pset_execution_time_packed,
10012 new_execution_time_packed.pset_execution_time_packed, relaxed, {
10013 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
10014 int64_t delta_ticks = curtime - last_update;
10015 if (delta_ticks <= 0) {
10016 /*
10017 * Its possible that another CPU came in and updated the pset_execution_time
10018 * before this CPU could do it. Since the average execution time is meant to
10019 * be an approximate measure per cluster, ignore the older update.
10020 */
10021 os_atomic_rmw_loop_give_up(return );
10022 }
10023 uint64_t delta_nsecs = 0;
10024 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
10025
10026 uint64_t nanotime = 0;
10027 absolutetime_to_nanoseconds(execution_time, &nanotime);
10028 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
10029
10030 /*
10031 * Since the average execution time is stored in microseconds, avoid rounding errors in
10032 * the EWMA calculation by only using a non-zero previous value.
10033 */
10034 uint64_t old_avg_thread_execution_time = MAX(old_execution_time_packed.pset_avg_thread_execution_time, 1ULL);
10035
10036 uint64_t old_execution_time = (old_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
10037 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
10038
10039 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
10040 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
10041 new_execution_time_packed.pset_execution_time_last_update = curtime;
10042 });
10043 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
10044 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
10045 }
10046 }
10047
10048 uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset,cluster_shared_rsrc_type_t shared_rsrc_type)10049 sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
10050 {
10051 /* Prevent migrations to derecommended clusters */
10052 if (!pset_is_recommended(pset)) {
10053 return UINT64_MAX;
10054 }
10055 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
10056 }
10057
10058 #else /* CONFIG_SCHED_EDGE */
10059
10060 void
sched_update_pset_load_average(processor_set_t pset,__unused uint64_t curtime)10061 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
10062 {
10063 int non_rt_load = pset->pset_runq.count;
10064 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
10065 int new_load_average = ((int)pset->load_average + load) >> 1;
10066
10067 pset->load_average = new_load_average;
10068 #if (DEVELOPMENT || DEBUG)
10069 #if __AMP__
10070 if (pset->pset_cluster_type == PSET_AMP_P) {
10071 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
10072 }
10073 #endif
10074 #endif
10075 }
10076
/* No-op stub: the per-cluster execution-time EWMA exists only under CONFIG_SCHED_EDGE */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
10081
10082 #endif /* CONFIG_SCHED_EDGE */
10083
10084 /* pset is locked */
10085 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)10086 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
10087 {
10088 int cpuid = processor->cpu_id;
10089 #if defined(__x86_64__)
10090 if (sched_avoid_cpu0 && (cpuid == 0)) {
10091 return false;
10092 }
10093 #endif
10094
10095 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
10096
10097 return bit_test(fasttrack_map, cpuid);
10098 }
10099
10100 #if CONFIG_SCHED_SMT
10101 /* pset is locked */
static processor_t
choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
	/*
	 * Pick a CPU for a realtime thread on an SMT system, preferring in order:
	 *   1. an eligible primary,
	 *   2. an eligible secondary whose primary is running realtime,
	 *   3. any eligible secondary,
	 * then falling back to any available non-realtime CPU while the pset still
	 * has more such CPUs than queued realtime threads.
	 * sched_avoid_cpu0 != 0 rotates cpu0 (and cpu1 for secondaries) to the end
	 * of the search order; == 2 excludes them outright, retrying with them
	 * allowed only if nothing else is found.
	 */
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/* Eligible: available, no urgent AST pending, not already running realtime */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Hard avoidance: remove cpu0 from consideration entirely */
		bit_clear(cpu_map, 0);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate right by one so cpu0 is searched last instead of first */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		/* Rotation by two means adding two back to recover the cpu id */
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * I was hoping the compiler would optimize
	 * this away when avoid_cpu0 is const bool false
	 * but it still complains about the assignmnent
	 * in that case.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Nothing found while hard-avoiding cpu0/cpu1; retry with them allowed */
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback test, now also counting secondary CPUs */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
10229 #else /* CONFIG_SCHED_SMT*/
10230 /* pset is locked */
10231 static processor_t
choose_processor_for_realtime_thread(processor_set_t pset,processor_t skip_processor,bool skip_spills)10232 choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills)
10233 {
10234 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
10235 if (skip_processor) {
10236 bit_clear(cpu_map, skip_processor->cpu_id);
10237 }
10238 if (skip_spills) {
10239 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
10240 }
10241
10242 int rotid = lsb_first(cpu_map);
10243 if (rotid >= 0) {
10244 return processor_array[rotid];
10245 }
10246
10247 /*
10248 * If we didn't find an obvious processor to choose, but there are still more CPUs
10249 * not already running realtime threads than realtime threads in the realtime run queue,
10250 * this thread belongs in this pset, so choose some other processor in this pset
10251 * to ensure the thread is enqueued here.
10252 */
10253 cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
10254 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
10255 cpu_map = non_realtime_map;
10256 assert(cpu_map != 0);
10257 int cpuid = bit_first(cpu_map);
10258 assert(cpuid >= 0);
10259 return processor_array[cpuid];
10260 }
10261
10262 return PROCESSOR_NULL;
10263 }
10264 #endif /* CONFIG_SCHED_SMT */
10265
10266 /*
10267 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
10268 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
10269 *
10270 * pset is locked.
10271 */
10272 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)10273 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
10274 {
10275 uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
10276 processor_t fd_processor = PROCESSOR_NULL;
10277 int lowest_priority = max_pri;
10278
10279 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
10280 if (skip_processor) {
10281 bit_clear(cpu_map, skip_processor->cpu_id);
10282 }
10283 if (skip_spills) {
10284 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
10285 }
10286
10287 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
10288 processor_t processor = processor_array[cpuid];
10289
10290 if (processor->current_pri > lowest_priority) {
10291 continue;
10292 }
10293
10294 if (processor->current_pri < lowest_priority) {
10295 lowest_priority = processor->current_pri;
10296 furthest_deadline = processor->deadline;
10297 fd_processor = processor;
10298 continue;
10299 }
10300
10301 if (processor->deadline > furthest_deadline) {
10302 furthest_deadline = processor->deadline;
10303 fd_processor = processor;
10304 }
10305 }
10306
10307 if (fd_processor) {
10308 return fd_processor;
10309 }
10310
10311 /*
10312 * There is a race condition possible when there are multiple processor sets.
10313 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
10314 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
10315 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
10316 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
10317 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
10318 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
10319 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
10320 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
10321 *
10322 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
10323 * on the run queue of that pset.
10324 */
10325 if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
10326 cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
10327 assert(skip_processor == PROCESSOR_NULL);
10328 assert(skip_spills == false);
10329
10330 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
10331 processor_t processor = processor_array[cpuid];
10332
10333 if (processor->current_pri > lowest_priority) {
10334 continue;
10335 }
10336
10337 if (processor->current_pri < lowest_priority) {
10338 lowest_priority = processor->current_pri;
10339 furthest_deadline = processor->deadline;
10340 fd_processor = processor;
10341 continue;
10342 }
10343
10344 if (processor->deadline > furthest_deadline) {
10345 furthest_deadline = processor->deadline;
10346 fd_processor = processor;
10347 }
10348 }
10349 }
10350
10351 return fd_processor;
10352 }
10353
10354 /* pset is locked */
10355 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)10356 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
10357 {
10358 (void) consider_secondaries;
10359 bool skip_spills = true;
10360 bool include_ast_urgent_pending_cpus = false;
10361
10362 #if CONFIG_SCHED_SMT
10363 processor_t next_processor = choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills);
10364 #else /* CONFIG_SCHED_SMT */
10365 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, skip_spills);
10366 #endif /* CONFIG_SCHED_SMT */
10367 if (next_processor != PROCESSOR_NULL) {
10368 return next_processor;
10369 }
10370
10371 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
10372 return next_processor;
10373 }
10374
10375 #if CONFIG_SCHED_SMT
10376 /* pset is locked */
10377 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)10378 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
10379 {
10380 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
10381 int nbackup_cpus = 0;
10382
10383 if (include_backups && rt_runq_is_low_latency(pset)) {
10384 nbackup_cpus = sched_rt_n_backup_processors;
10385 }
10386
10387 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
10388 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
10389 bit_clear(cpu_map, 0);
10390 }
10391 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
10392 }
10393
10394 /* pset is locked */
10395 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)10396 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
10397 {
10398 int nbackup_cpus = 0;
10399
10400 if (include_backups && rt_runq_is_low_latency(pset)) {
10401 nbackup_cpus = sched_rt_n_backup_processors;
10402 }
10403
10404 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
10405 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
10406 }
10407 #endif /* CONFIG_SCHED_SMT */
10408
/*
 * sched_ok_to_run_realtime_thread:
 *
 * Decide whether a realtime thread may be placed on 'processor' in 'pset'.
 * 'as_backup' is forwarded to the "are the others busy" checks, which may
 * count additional backup CPUs as needed.
 */
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	/* Never place realtime threads on de-recommended processors. */
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if CONFIG_SCHED_SMT
	/* A pending realtime spill targeted at this CPU overrides the checks below. */
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* cpu0 is a last resort: only OK when all other primaries run RT (~0x1 masks off cpu0). */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* Stronger avoidance: require all CPUs other than 0 and 1 to run RT (~0x3 masks off cpus 0 and 1). */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/*
		 * cpu1 on an SMT system (presumably cpu0's sibling — confirm topology):
		 * allowed only if RT-on-SMT is permitted and all CPUs but cpu1 run RT (~0x2).
		 */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* Secondary (SMT sibling) CPU: allowed only if permitted and every available primary already runs RT. */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else /* CONFIG_SCHED_SMT */
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif /* CONFIG_SCHED_SMT */
	return ok_to_run_realtime_thread;
}
10439
10440 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)10441 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
10442 {
10443 if (drop_lock) {
10444 pset_unlock(pset);
10445 }
10446 }
10447
#if defined(__x86_64__)
/*
 * thread_set_no_smt:
 *
 * Opt the calling thread out of running on SMT sibling processors.
 * Note: only sets TH_SFLAG_NO_SMT; calling with set == false does NOT
 * clear the flag, so the opt-out is one-way.
 */
void
thread_set_no_smt(bool set)
{
	(void) set;
#if CONFIG_SCHED_SMT
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	thread_t thread = current_thread();

	/* Flag updates require the thread lock at splsched. */
	spl_t s = splsched();
	thread_lock(thread);
	if (set) {
		thread->sched_flags |= TH_SFLAG_NO_SMT;
	}
	thread_unlock(thread);
	splx(s);
#endif /* CONFIG_SCHED_SMT */
}
#endif /* __x86_64__ */
10471
10472
10473 #if CONFIG_SCHED_SMT
10474 bool
thread_get_no_smt(void)10475 thread_get_no_smt(void)
10476 {
10477 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
10478 }
10479
10480 extern void task_set_no_smt(task_t);
10481 void
task_set_no_smt(task_t task)10482 task_set_no_smt(task_t task)
10483 {
10484 if (!system_is_SMT) {
10485 /* Not a machine that supports SMT */
10486 return;
10487 }
10488
10489 if (task == TASK_NULL) {
10490 task = current_task();
10491 }
10492
10493 task_lock(task);
10494 task->t_flags |= TF_NO_SMT;
10495 task_unlock(task);
10496 }
10497
10498 #if DEBUG || DEVELOPMENT
10499 extern void sysctl_task_set_no_smt(char no_smt);
10500 void
sysctl_task_set_no_smt(char no_smt)10501 sysctl_task_set_no_smt(char no_smt)
10502 {
10503 if (!system_is_SMT) {
10504 /* Not a machine that supports SMT */
10505 return;
10506 }
10507
10508 task_t task = current_task();
10509
10510 task_lock(task);
10511 if (no_smt == '1') {
10512 task->t_flags |= TF_NO_SMT;
10513 }
10514 task_unlock(task);
10515 }
10516
10517 extern char sysctl_task_get_no_smt(void);
10518 char
sysctl_task_get_no_smt(void)10519 sysctl_task_get_no_smt(void)
10520 {
10521 task_t task = current_task();
10522
10523 if (task->t_flags & TF_NO_SMT) {
10524 return '1';
10525 }
10526 return '0';
10527 }
10528 #endif /* DEVELOPMENT || DEBUG */
10529 #else /* CONFIG_SCHED_SMT */
10530
10531 extern void task_set_no_smt(task_t);
10532 void
task_set_no_smt(__unused task_t task)10533 task_set_no_smt(__unused task_t task)
10534 {
10535 return;
10536 }
10537
10538 #if DEBUG || DEVELOPMENT
10539 extern void sysctl_task_set_no_smt(char no_smt);
10540 void
sysctl_task_set_no_smt(__unused char no_smt)10541 sysctl_task_set_no_smt(__unused char no_smt)
10542 {
10543 return;
10544 }
10545
extern char sysctl_task_get_no_smt(void);
/* SMT scheduling support not configured: report tasks as always no-SMT. */
char
sysctl_task_get_no_smt(void)
{
	return '1';
}
10552 #endif /* DEBUG || DEVELOPMENT */
10553 #endif /* CONFIG_SCHED_SMT */
10554
/*
 * thread_soft_bind_cluster_type:
 *
 * Soft-bind 'thread' to a cluster of the given type ('e'/'E' for the
 * E-core node, 'p'/'P' for the P-core node).  Any other character, or a
 * node with no initialized psets, leaves the thread unbound
 * (THREAD_BOUND_CLUSTER_NONE).  On non-AMP configurations this is a no-op.
 */
__private_extern__ void
thread_soft_bind_cluster_type(thread_t thread, char cluster_type)
{
#if __AMP__
	spl_t s = splsched();
	thread_lock(thread);
	/* Start from the unbound state; rebind below only if a valid node is found. */
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	pset_node_t bind_node = PSET_NODE_NULL;
	switch (cluster_type) {
	case 'e':
	case 'E':
		if (ecore_node->psets != PROCESSOR_SET_NULL) {
			bind_node = ecore_node;
		}
		break;
	case 'p':
	case 'P':
		if (pcore_node->psets != PROCESSOR_SET_NULL) {
			bind_node = pcore_node;
		}
		break;
	default:
		break;
	}
	if (bind_node != PSET_NODE_NULL) {
		/* Bind to the first pset of the chosen node. */
		thread->th_bound_cluster_id = bind_node->psets->pset_id;
	}
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		/* Trigger a context-switch to get on the newly bound cluster */
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
#endif /* __AMP__ */
}
10594
10595 extern uint32_t thread_bound_cluster_id(thread_t thread);
10596 uint32_t
thread_bound_cluster_id(thread_t thread)10597 thread_bound_cluster_id(thread_t thread)
10598 {
10599 return thread->th_bound_cluster_id;
10600 }
10601
10602 __private_extern__ kern_return_t
thread_soft_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)10603 thread_soft_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
10604 {
10605 #if __AMP__
10606 if (cluster_id == THREAD_BOUND_CLUSTER_NONE) {
10607 /* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
10608 options |= THREAD_UNBIND;
10609 }
10610
10611 if (options & THREAD_UNBIND) {
10612 cluster_id = THREAD_BOUND_CLUSTER_NONE;
10613 } else {
10614 /* Validate the specified cluster id */
10615 int max_clusters = ml_get_cluster_count();
10616 if (cluster_id >= max_clusters) {
10617 /* Invalid cluster id */
10618 return KERN_INVALID_VALUE;
10619 }
10620 processor_set_t pset = pset_array[cluster_id];
10621 if (pset == NULL) {
10622 /* Cluster has not finished initializing at boot */
10623 return KERN_FAILURE;
10624 }
10625 if (options & THREAD_BIND_ELIGIBLE_ONLY) {
10626 if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
10627 /* Thread is not recommended for the cluster type */
10628 return KERN_INVALID_POLICY;
10629 }
10630 }
10631 }
10632
10633 spl_t s = splsched();
10634 thread_lock(thread);
10635
10636 thread->th_bound_cluster_id = cluster_id;
10637
10638 thread_unlock(thread);
10639 splx(s);
10640
10641 if (thread == current_thread()) {
10642 /* Trigger a context-switch to get on the newly bound cluster */
10643 thread_block(THREAD_CONTINUE_NULL);
10644 }
10645 #else /* __AMP__ */
10646 (void)thread;
10647 (void)cluster_id;
10648 (void)options;
10649 #endif /* __AMP__ */
10650 return KERN_SUCCESS;
10651 }
10652
10653 #if DEVELOPMENT || DEBUG
10654 extern int32_t sysctl_get_bound_cpuid(void);
10655 int32_t
sysctl_get_bound_cpuid(void)10656 sysctl_get_bound_cpuid(void)
10657 {
10658 int32_t cpuid = -1;
10659 thread_t self = current_thread();
10660
10661 processor_t processor = self->bound_processor;
10662 if (processor == NULL) {
10663 cpuid = -1;
10664 } else {
10665 cpuid = processor->cpu_id;
10666 }
10667
10668 return cpuid;
10669 }
10670
10671 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
10672 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)10673 sysctl_thread_bind_cpuid(int32_t cpuid)
10674 {
10675 processor_t processor = PROCESSOR_NULL;
10676
10677 if (cpuid == -1) {
10678 goto unbind;
10679 }
10680
10681 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
10682 return KERN_INVALID_VALUE;
10683 }
10684
10685 processor = processor_array[cpuid];
10686 if (processor == PROCESSOR_NULL) {
10687 return KERN_INVALID_VALUE;
10688 }
10689
10690 unbind:
10691 thread_bind(processor);
10692
10693 thread_block(THREAD_CONTINUE_NULL);
10694 return KERN_SUCCESS;
10695 }
10696
#if __AMP__
/* Map an AMP pset cluster type to its one-character name ('E' or 'P'). */
static char
pset_cluster_type_name_char(pset_cluster_type_t pset_type)
{
	if (pset_type == PSET_AMP_E) {
		return 'E';
	}
	if (pset_type == PSET_AMP_P) {
		return 'P';
	}
	panic("Unexpected AMP pset cluster type %d", pset_type);
}
#endif /* __AMP__ */
10711
extern char sysctl_get_task_cluster_type(void);
/* sysctl helper: return the current task's pset-hint cluster type, or '0' if none. */
char
sysctl_get_task_cluster_type(void)
{
#if __AMP__
	processor_set_t hint = current_task()->pset_hint;

	return hint ? pset_cluster_type_name_char(hint->pset_cluster_type) : '0';
#else /* !__AMP__ */
	return '0';
#endif /* __AMP__ */
}
10728
10729 #if __AMP__
10730 extern char sysctl_get_bound_cluster_type(void);
10731 char
sysctl_get_bound_cluster_type(void)10732 sysctl_get_bound_cluster_type(void)
10733 {
10734 thread_t self = current_thread();
10735
10736 if (self->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) {
10737 return '0';
10738 }
10739 pset_cluster_type_t pset_type = pset_array[self->th_bound_cluster_id]->pset_cluster_type;
10740 return pset_cluster_type_name_char(pset_type);
10741 }
10742
/*
 * find_pset_of_type:
 *
 * Return a pset of the requested AMP cluster type, preferring one whose
 * processors are currently recommended.  Only the first node of matching
 * type is searched; if none of its psets is recommended, the last pset
 * visited in that node is returned (PROCESSOR_SET_NULL if the node's
 * pset_map is empty).  Returns PROCESSOR_SET_NULL if no node matches.
 */
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer one with recommended processors */
			if (pset_is_recommended(pset)) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
10766 #endif /* __AMP__ */
10767
10768 extern void sysctl_task_set_cluster_type(char cluster_type);
10769 void
sysctl_task_set_cluster_type(char cluster_type)10770 sysctl_task_set_cluster_type(char cluster_type)
10771 {
10772 task_t task = current_task();
10773 processor_set_t pset_hint = PROCESSOR_SET_NULL;
10774
10775 #if __AMP__
10776 switch (cluster_type) {
10777 case 'e':
10778 case 'E':
10779 pset_hint = find_pset_of_type(PSET_AMP_E);
10780 break;
10781 case 'p':
10782 case 'P':
10783 pset_hint = find_pset_of_type(PSET_AMP_P);
10784 break;
10785 default:
10786 break;
10787 }
10788
10789 if (pset_hint) {
10790 task_lock(task);
10791 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
10792 task->pset_hint = pset_hint;
10793 task_unlock(task);
10794
10795 thread_block(THREAD_CONTINUE_NULL);
10796 }
10797 #else
10798 (void)cluster_type;
10799 (void)task;
10800 (void)pset_hint;
10801 #endif
10802 }
10803
10804 #endif /* DEVELOPMENT || DEBUG */
10805