1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #if MONOTONIC
94 #include <kern/monotonic.h>
95 #endif /* MONOTONIC */
96 #include <kern/processor.h>
97 #include <kern/queue.h>
98 #include <kern/recount.h>
99 #include <kern/restartable.h>
100 #include <kern/sched.h>
101 #include <kern/sched_prim.h>
102 #include <kern/sfi.h>
103 #include <kern/syscall_subr.h>
104 #include <kern/task.h>
105 #include <kern/thread.h>
106 #include <kern/thread_group.h>
107 #include <kern/ledger.h>
108 #include <kern/timer_queue.h>
109 #include <kern/waitq.h>
110 #include <kern/policy_internal.h>
111
112 #include <vm/pmap.h>
113 #include <vm/vm_kern.h>
114 #include <vm/vm_map.h>
115 #include <vm/vm_pageout.h>
116
117 #include <mach/sdt.h>
118 #include <mach/mach_host.h>
119 #include <mach/host_info.h>
120
121 #include <sys/kdebug.h>
122 #include <kperf/kperf.h>
123 #include <kern/kpc.h>
124 #include <san/kasan.h>
125 #include <kern/pms.h>
126 #include <kern/host.h>
127 #include <stdatomic.h>
128 #include <os/atomic_private.h>
129
130 #ifdef KDBG_MACOS_RELEASE
131 #define KTRC KDBG_MACOS_RELEASE
132 #else
133 #define KTRC KDBG_RELEASE
134 #endif
135
136 struct sched_statistics PERCPU_DATA(sched_stats);
137 bool sched_stats_active;
138
/*
 * Saturating addition for deadline arithmetic: returns d + e, clamped to
 * UINT64_MAX on unsigned overflow so a far-future deadline can never wrap
 * around to a small (i.e. already-expired) value.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	if (d > UINT64_MAX - e) {
		return UINT64_MAX;
	}
	return d + e;
}
145
/*
 * rt_runq_count:
 *
 * Return the number of threads currently on the pset's realtime run queue.
 * Lockless relaxed-atomic read, so the value may be stale by the time the
 * caller acts on it.
 */
int
rt_runq_count(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
}
151
/*
 * rt_runq_earliest_deadline:
 *
 * Return the earliest deadline of any thread on the pset's realtime run
 * queue (RT_DEADLINE_NONE when empty).  Lockless relaxed-atomic wide read;
 * may be stale.
 */
uint64_t
rt_runq_earliest_deadline(processor_set_t pset)
{
	return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
}
157
158 static int
rt_runq_priority(processor_set_t pset)159 rt_runq_priority(processor_set_t pset)
160 {
161 pset_assert_locked(pset);
162 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
163
164 bitmap_t *map = rt_run_queue->bitmap;
165 int i = bitmap_first(map, NRTQS);
166 assert(i < NRTQS);
167
168 if (i >= 0) {
169 return i + BASEPRI_RTQUEUES;
170 }
171
172 return i;
173 }
174
175 static thread_t rt_runq_first(rt_queue_t rt_runq);
176
177 #if DEBUG
/*
 * check_rt_runq_consistency:
 *
 * DEBUG-only validation that every invariant of the realtime run queue
 * holds: per-priority queues are deadline-sorted, per-priority summary
 * fields match the queue heads, the priority bitmap matches occupancy,
 * and the queue-wide atomic summary fields (count, earliest deadline,
 * constraint, ed_index) match a full recomputation.  If 'thread' is
 * non-NULL, additionally asserts that it is present somewhere on the
 * queue.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	/* Recomputed-from-scratch versions of the cached summary fields. */
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			/* Thread must be queued at its own priority with sane RT params. */
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* Queue must be sorted by non-decreasing deadline. */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* Cached per-priority summary mirrors the queue head. */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* Empty level: bitmap clear and summary fields at sentinels. */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		/* Track the globally earliest deadline across all priority levels. */
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* Queue-wide cached summaries must match the recomputed values. */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
236 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
237 #else
238 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
239 #endif
240
241 uint32_t rt_constraint_threshold;
242
/*
 * rt_runq_is_low_latency:
 *
 * True when the tightest constraint currently on the pset's realtime run
 * queue is at or below rt_constraint_threshold.  Lockless relaxed read;
 * may be stale.
 */
static bool
rt_runq_is_low_latency(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
}
248
249 TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
250
251 /* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
252 TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
253 static uint64_t nonurgent_preemption_timer_abs = 0;
254
255 #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
256 TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
257
258 #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
259 TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
260
261 #define MAX_UNSAFE_RT_QUANTA 100
262 #define SAFE_RT_MULTIPLIER 2
263
264 #define MAX_UNSAFE_FIXED_QUANTA 100
265 #define SAFE_FIXED_MULTIPLIER 2
266
267 TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
268 TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
269
270 TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
271 TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_RT_MULTIPLIER);
272
273 #define MAX_POLL_QUANTA 2
274 TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
275
276 #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
277 int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
278
279 uint64_t max_poll_computation;
280
281 uint64_t max_unsafe_rt_computation;
282 uint64_t max_unsafe_fixed_computation;
283 uint64_t sched_safe_rt_duration;
284 uint64_t sched_safe_fixed_duration;
285
286 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
287
288 uint32_t std_quantum;
289 uint32_t min_std_quantum;
290 uint32_t bg_quantum;
291
292 uint32_t std_quantum_us;
293 uint32_t bg_quantum_us;
294
295 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
296
297 uint32_t thread_depress_time;
298 uint32_t default_timeshare_computation;
299 uint32_t default_timeshare_constraint;
300
301 uint32_t max_rt_quantum;
302 uint32_t min_rt_quantum;
303
304 uint32_t rt_deadline_epsilon;
305
306 uint32_t rt_constraint_threshold;
307
308 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
309
310 unsigned sched_tick;
311 uint32_t sched_tick_interval;
312
313 /* Timeshare load calculation interval (15ms) */
314 uint32_t sched_load_compute_interval_us = 15000;
315 uint64_t sched_load_compute_interval_abs;
316 static _Atomic uint64_t sched_load_compute_deadline;
317
318 uint32_t sched_pri_shifts[TH_BUCKET_MAX];
319 uint32_t sched_fixed_shift;
320
321 uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
322
323 /* Allow foreground to decay past default to resolve inversions */
324 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
325 int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
326
327 /* Defaults for timer deadline profiling */
328 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
329 * 2ms */
330 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
331 * <= 5ms */
332
333 uint64_t timer_deadline_tracking_bin_1;
334 uint64_t timer_deadline_tracking_bin_2;
335
336 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
337
338 thread_t sched_maintenance_thread;
339
340 /* interrupts disabled lock to guard recommended cores state */
341 decl_simple_lock_data(, sched_available_cores_lock);
342 uint64_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343 uint64_t perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344 uint64_t perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
345 static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
346 static uint64_t sched_online_processors = 0;
347 static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
348 static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
349
350 #if __arm64__
351 static void sched_recommended_cores_maintenance(void);
352 uint64_t perfcontrol_failsafe_starvation_threshold;
353 extern char *proc_name_address(struct proc *p);
354 #endif /* __arm64__ */
355
356 uint64_t sched_one_second_interval;
357 boolean_t allow_direct_handoff = TRUE;
358
359 /* Forwards */
360
361 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
362
363 static void load_shift_init(void);
364 static void preempt_pri_init(void);
365
366 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
367
368 thread_t processor_idle(
369 thread_t thread,
370 processor_t processor);
371
372 static ast_t
373 csw_check_locked(
374 thread_t thread,
375 processor_t processor,
376 processor_set_t pset,
377 ast_t check_reason);
378
379 static void processor_setrun(
380 processor_t processor,
381 thread_t thread,
382 integer_t options);
383
384 static void
385 sched_realtime_timebase_init(void);
386
387 static void
388 sched_timer_deadline_tracking_init(void);
389
390 #if DEBUG
391 extern int debug_task;
392 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
393 #else
394 #define TLOG(a, fmt, args...) do {} while (0)
395 #endif
396
397 static processor_t
398 thread_bind_internal(
399 thread_t thread,
400 processor_t processor);
401
402 static void
403 sched_vm_group_maintenance(void);
404
405 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
406 int8_t sched_load_shifts[NRQS];
407 bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
408 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
409
410 /*
411 * Statically allocate a buffer to hold the longest possible
412 * scheduler description string, as currently implemented.
413 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
414 * to export to userspace via sysctl(3). If either version
415 * changes, update the other.
416 *
417 * Note that in addition to being an upper bound on the strings
418 * in the kernel, it's also an exact parameter to PE_get_default(),
419 * which interrogates the device tree on some platforms. That
420 * API requires the caller know the exact size of the device tree
421 * property, so we need both a legacy size (32) and the current size
422 * (48) to deal with old and new device trees. The device tree property
423 * is similarly padded to a fixed size so that the same kernel image
424 * can run on multiple devices with different schedulers configured
425 * in the device tree.
426 */
427 char sched_string[SCHED_STRING_MAX_LENGTH];
428
429 uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
430
431 /* Global flag which indicates whether Background Stepper Context is enabled */
432 static int cpu_throttle_enabled = 1;
433
434 #if DEVELOPMENT || DEBUG
435 int enable_task_set_cluster_type = 0;
436 bool system_ecore_only = false;
437 #endif /* DEVELOPMENT || DEBUG */
438
/*
 * sched_init:
 *
 * Boot-time scheduler initialization: resolves the priority-decay band
 * limit (boot-arg, then device tree, then default), parses debug flags,
 * records the scheduler name string, and initializes the active scheduler,
 * the realtime run queue, pset0, and the master processor.  Called once
 * at boot; ordering of the SCHED(...) callouts matters.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the scheduler name (also consumed by sysctl in bsd/). */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	/* enable_skstsct == 2 additionally forces E-core-only operation. */
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}
485
/*
 * sched_timebase_init:
 *
 * Derive absolute-time scheduler intervals once the timebase is known:
 * caches one second in abs-time units, then lets the active scheduler and
 * the realtime subsystem compute their own timebase-derived values.
 */
void
sched_timebase_init(void)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}
497
498 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
499
500 void
sched_timeshare_init(void)501 sched_timeshare_init(void)
502 {
503 /*
504 * Calculate the timeslicing quantum
505 * in us.
506 */
507 if (default_preemption_rate < 1) {
508 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
509 }
510 std_quantum_us = (1000 * 1000) / default_preemption_rate;
511
512 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
513
514 if (default_bg_preemption_rate < 1) {
515 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
516 }
517 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
518
519 printf("standard background quantum is %d us\n", bg_quantum_us);
520
521 load_shift_init();
522 preempt_pri_init();
523 sched_tick = 0;
524 }
525
526 void
sched_set_max_unsafe_rt_quanta(int max)527 sched_set_max_unsafe_rt_quanta(int max)
528 {
529 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
530
531 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
532
533 const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
534 sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
535
536
537 #if DEVELOPMENT || DEBUG
538 max_unsafe_rt_quanta = max;
539 #else
540 /*
541 * On RELEASE kernels, this is only called on boot where
542 * max is already equal to max_unsafe_rt_quanta.
543 */
544 assert3s(max, ==, max_unsafe_rt_quanta);
545 #endif
546 }
547
548 void
sched_set_max_unsafe_fixed_quanta(int max)549 sched_set_max_unsafe_fixed_quanta(int max)
550 {
551 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
552
553 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
554
555 const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
556 sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
557
558 #if DEVELOPMENT || DEBUG
559 max_unsafe_fixed_quanta = max;
560 #else
561 /*
562 * On RELEASE kernels, this is only called on boot where
563 * max is already equal to max_unsafe_fixed_quanta.
564 */
565 assert3s(max, ==, max_unsafe_fixed_quanta);
566 #endif
567 }
568
/*
 * sched_timeshare_timebase_init:
 *
 * Convert all timeshare scheduler tunables from microseconds into
 * absolute-time units once the timebase is known: quanta, tick interval,
 * load-computation interval, the usage-to-priority decay shift, and the
 * fail-safe durations.  Each 32-bit result is asserted to be non-zero and
 * to fit in 32 bits.
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 *
	 * NOTE(review): at this point 'abstime' still holds the scheduler
	 * tick interval; the 5/3 scaling below feeds the shift search.  The
	 * relationship between the 5/3 factor here and the 5/8 aging noted
	 * above is not evident from this file alone — confirm against the
	 * decay math in the timeshare priority code before changing.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* Start all buckets with an "effectively no penalty" shift. */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* Fail-safe computation limits depend on the quantum size computed above. */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	/* Optional nonurgent-preemption timer (disabled when the boot-arg is 0). */
	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
633
634 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
635
636 void
pset_rt_init(processor_set_t pset)637 pset_rt_init(processor_set_t pset)
638 {
639 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
640 int i = pri - BASEPRI_RTQUEUES;
641 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
642 queue_init(&rqi->pri_queue);
643 rqi->pri_count = 0;
644 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
645 rqi->pri_constraint = RT_CONSTRAINT_NONE;
646 }
647 os_atomic_init(&pset->rt_runq.count, 0);
648 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
649 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
650 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
651 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
652 }
653
654 /* epsilon for comparing RT deadlines */
655 int rt_deadline_epsilon_us = 100;
656
/*
 * sched_get_rt_deadline_epsilon:
 *
 * Return the current RT deadline-comparison epsilon, in microseconds.
 */
int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}
662
663 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)664 sched_set_rt_deadline_epsilon(int new_epsilon_us)
665 {
666 rt_deadline_epsilon_us = new_epsilon_us;
667
668 uint64_t abstime;
669 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
670 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
671 rt_deadline_epsilon = (uint32_t)abstime;
672 }
673
/*
 * sched_realtime_timebase_init:
 *
 * Convert the realtime scheduler constants to absolute-time units once
 * the timebase is known: minimum/maximum RT computation lengths, the
 * backup-IPI constraint threshold, and the deadline-comparison epsilon.
 */
static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}
698
699 void
sched_check_spill(processor_set_t pset,thread_t thread)700 sched_check_spill(processor_set_t pset, thread_t thread)
701 {
702 (void)pset;
703 (void)thread;
704
705 return;
706 }
707
708 bool
sched_thread_should_yield(processor_t processor,thread_t thread)709 sched_thread_should_yield(processor_t processor, thread_t thread)
710 {
711 (void)thread;
712
713 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
714 }
715
716 /* Default implementations of .steal_thread_enabled */
/*
 * Always-false .steal_thread_enabled implementation: stealing from other
 * psets is never permitted.  Ignores its argument.
 */
bool
sched_steal_thread_DISABLED(processor_set_t pset)
{
	(void)pset;
	return false;
}
723
/*
 * .steal_thread_enabled implementation that permits stealing whenever the
 * node spans more than one pset (there must be somewhere to steal from).
 */
bool
sched_steal_thread_enabled(processor_set_t pset)
{
	return bit_count(pset->node->pset_map) > 1;
}
729
730 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
731
732 /*
733 * Set up values for timeshare
734 * loading factors.
735 */
/*
 * Set up values for timeshare
 * loading factors.
 *
 * Fills sched_load_shifts[load] with the shift 'k' such that a thread's
 * per-quantum priority penalty at system load 'load' is ~2^k priority
 * levels.  Entries 0 and 1 are INT8_MIN (no penalty) and 0; higher loads
 * get progressively larger shifts, with "sched_decay_penalty" widening
 * the range of loads that share each small shift value.  A penalty of 0
 * disables decay entirely by filling the whole table with INT8_MIN.
 */
static void
load_shift_init(void)
{
	int8_t k, *p = sched_load_shifts;
	uint32_t i, j;

	uint32_t sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1: no penalty, then 2^0. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/* Note: 'i' advances only in the inner loop; 'k' grows once per doubling of 'j'. */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
781
782 static void
preempt_pri_init(void)783 preempt_pri_init(void)
784 {
785 bitmap_t *p = sched_preempt_pri;
786
787 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
788 bitmap_set(p, i);
789 }
790
791 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
792 bitmap_set(p, i);
793 }
794 }
795
796 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
797
798 void
check_monotonic_time(uint64_t ctime)799 check_monotonic_time(uint64_t ctime)
800 {
801 processor_t processor = current_processor();
802 uint64_t last_dispatch = processor->last_dispatch;
803
804 if (last_dispatch > ctime) {
805 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
806 last_dispatch, ctime);
807 }
808 }
809
810
811 /*
812 * Thread wait timer expiration.
813 * Runs in timer interrupt context with interrupts disabled.
814 */
/*
 * p0 is the thread whose wait timed out; p1 is unused.  Interrupts are
 * off and the thread lock is taken for the duration, except that
 * clear_wait_internal may transiently drop it.
 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	/* Only time out the wait if the timer wasn't cancelled in the meantime. */
	if (thread->wait_timer_armed) {
		thread->wait_timer_armed = false;
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	/* Drop this expiration's reference on the timer's activity count. */
	thread->wait_timer_active--;

	thread_unlock(thread);
}
836
837 /*
838 * thread_unblock:
839 *
840 * Unblock thread on wake up.
841 *
842 * Returns TRUE if the thread should now be placed on the runqueue.
843 *
844 * Thread must be locked.
845 *
846 * Called at splsched().
847 */
boolean_t
thread_unblock(
	thread_t                thread,
	wait_result_t           wresult)
{
	boolean_t               ready_for_runq = FALSE;
	thread_t                cthread = current_thread();
	uint32_t                new_run_count;
	int                     old_thread_state;

	/*
	 * Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 * Cancel pending wait timer.
	 */
	if (thread->wait_timer_armed) {
		/*
		 * A successful cancel means the timer callout will never fire,
		 * so drop the activity reference it would have dropped itself.
		 */
		if (timer_call_cancel(thread->wait_timer)) {
			thread->wait_timer_active--;
		}
		thread->wait_timer_armed = false;
	}

	/* Were we woken from interrupt context / platform idle? */
	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 * Update scheduling state: not waiting,
	 * set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);

	if ((old_thread_state & TH_RUN) == 0) {
		/* Thread was genuinely blocked: it becomes runnable now. */
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */

	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		/* Direct interrupt wakeup of a non-callout thread. */
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		/* Bucket the wakeup by the timer's time-to-deadline, if any. */
		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/*
		 * "Double hop": a callout thread (itself woken from interrupt
		 * context) is waking this thread; attribute the wakeup to it.
		 */
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	/* Record wakeup provenance on callout threads for the next hop. */
	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
1002
1003 /*
1004 * Routine: thread_allowed_for_handoff
1005 * Purpose:
1006 * Check if the thread is allowed for handoff operation
1007 * Conditions:
1008 * thread lock held, IPC locks may be held.
1009 * TODO: In future, do not allow handoff if threads have different cluster
1010 * recommendations.
1011 */
1012 boolean_t
thread_allowed_for_handoff(thread_t thread)1013 thread_allowed_for_handoff(
1014 thread_t thread)
1015 {
1016 thread_t self = current_thread();
1017
1018 if (allow_direct_handoff &&
1019 thread->sched_mode == TH_MODE_REALTIME &&
1020 self->sched_mode == TH_MODE_REALTIME) {
1021 return TRUE;
1022 }
1023
1024 return FALSE;
1025 }
1026
1027 /*
1028 * Routine: thread_go
1029 * Purpose:
1030 * Unblock and dispatch thread.
1031 * Conditions:
1032 * thread lock held, IPC locks may be held.
1033 * thread must have been waiting
1034 */
void
thread_go(
	thread_t        thread,
	wait_result_t   wresult,
	bool            try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/*
	 * The wait must already be fully torn down: no safe-point flag,
	 * no wait event, and no waitq still attached to the thread.
	 */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_is_null(thread->waitq));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);

	/* A started thread being go'ed must be mid-wakeup (TH_WAKING). */
	if (thread->started) {
		assert(thread->state & TH_WAKING);
	}

	thread_lock_assert(thread, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);

	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			/*
			 * Stash a reference for the handoff instead of enqueueing;
			 * the reference is consumed by whoever performs the handoff.
			 */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			/* Normal path: place the thread on a runqueue, preempting if needed. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}
}
1075
1076 /*
1077 * Routine: thread_mark_wait_locked
1078 * Purpose:
1079 * Mark a thread as waiting. If, given the circumstances,
1080 * it doesn't want to wait (i.e. already aborted), then
1081 * indicate that in the return value.
1082 * Conditions:
1083 * at splsched() and thread is locked.
1084 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t thread,
	wait_interrupt_t interruptible_orig)
{
	boolean_t at_safe_point;
	wait_interrupt_t interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	/* None of the wait/wake-related state bits may be set yet. */
	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 * The thread may have certain types of interrupts/aborts masked
	 * off.  Even if the wait location says these types of interrupts
	 * are OK, we have to honor mask settings (outer-scoped code may
	 * not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/*
	 * Proceed with the wait unless a pending abort should interrupt it
	 * right now (uninterruptible waits and safely-deferred aborts still wait).
	 */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			/* A zero interruptible level means an uninterruptible wait. */
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/* Report this wait via the sched call unless the caller suppressed it. */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Wait refused: consume a one-shot "safe" abort, if any. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1153
1154 /*
1155 * Routine: thread_interrupt_level
1156 * Purpose:
1157 * Set the maximum interruptible state for the
1158 * current thread. The effective value of any
1159 * interruptible flag passed into assert_wait
1160 * will never exceed this.
1161 *
1162 * Useful for code that must not be interrupted,
1163 * but which calls code that doesn't know that.
1164 * Returns:
1165 * The old interrupt level for the thread.
1166 */
1167 __private_extern__
1168 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1169 thread_interrupt_level(
1170 wait_interrupt_t new_level)
1171 {
1172 thread_t thread = current_thread();
1173 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1174
1175 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1176
1177 return result;
1178 }
1179
1180 /*
1181 * assert_wait:
1182 *
1183 * Assert that the current thread is about to go to
1184 * sleep until the specified event occurs.
1185 */
1186 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1187 assert_wait(
1188 event_t event,
1189 wait_interrupt_t interruptible)
1190 {
1191 if (__improbable(event == NO_EVENT)) {
1192 panic("%s() called with NO_EVENT", __func__);
1193 }
1194
1195 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1196 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1197 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1198
1199 struct waitq *waitq;
1200 waitq = global_eventq(event);
1201 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1202 }
1203
1204 /*
1205 * assert_wait_queue:
1206 *
1207 * Return the global waitq for the specified event
1208 */
1209 struct waitq *
assert_wait_queue(event_t event)1210 assert_wait_queue(
1211 event_t event)
1212 {
1213 return global_eventq(event);
1214 }
1215
/*
 * assert_wait_timeout:
 *
 * Assert a wait on 'event' that times out after 'interval' units of
 * 'scale_factor' (see clock_interval_to_deadline), with no timer leeway.
 */
wait_result_t
assert_wait_timeout(
	event_t event,
	wait_interrupt_t interruptible,
	uint32_t interval,
	uint32_t scale_factor)
{
	thread_t thread = current_thread();
	wait_result_t wresult;
	uint64_t deadline;
	spl_t s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	/* Convert the relative interval to an absolute deadline. */
	clock_interval_to_deadline(interval, scale_factor, &deadline);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	/* Queue the wait while holding the waitq lock. */
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL,
	    deadline, TIMEOUT_NO_LEEWAY,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1254
/*
 * assert_wait_timeout_with_leeway:
 *
 * Like assert_wait_timeout, but with an explicit timer urgency and a
 * leeway interval ('leeway' in 'scale_factor' units) allowing the
 * timeout to be coalesced.
 */
wait_result_t
assert_wait_timeout_with_leeway(
	event_t event,
	wait_interrupt_t interruptible,
	wait_timeout_urgency_t urgency,
	uint32_t interval,
	uint32_t leeway,
	uint32_t scale_factor)
{
	thread_t thread = current_thread();
	wait_result_t wresult;
	uint64_t deadline;
	uint64_t abstime;
	uint64_t slop;
	uint64_t now;
	spl_t s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	/* Compute deadline and leeway in absolute time before taking the lock. */
	now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	deadline = now + abstime;

	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1301
1302 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1303 assert_wait_deadline(
1304 event_t event,
1305 wait_interrupt_t interruptible,
1306 uint64_t deadline)
1307 {
1308 thread_t thread = current_thread();
1309 wait_result_t wresult;
1310 spl_t s;
1311
1312 if (__improbable(event == NO_EVENT)) {
1313 panic("%s() called with NO_EVENT", __func__);
1314 }
1315
1316 struct waitq *waitq;
1317 waitq = global_eventq(event);
1318
1319 s = splsched();
1320 waitq_lock(waitq);
1321
1322 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1323 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1324 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1325
1326 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1327 interruptible,
1328 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1329 TIMEOUT_NO_LEEWAY, thread);
1330 waitq_unlock(waitq);
1331 splx(s);
1332 return wresult;
1333 }
1334
/*
 * assert_wait_deadline_with_leeway:
 *
 * Assert a wait on 'event' with an absolute-time deadline, a timer
 * urgency, and an absolute-time leeway for timeout coalescing.
 */
wait_result_t
assert_wait_deadline_with_leeway(
	event_t event,
	wait_interrupt_t interruptible,
	wait_timeout_urgency_t urgency,
	uint64_t deadline,
	uint64_t leeway)
{
	thread_t thread = current_thread();
	wait_result_t wresult;
	spl_t s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, leeway,
	    thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}
1369
/*
 * sched_cond_init:
 *
 * Initialize a scheduling condition variable to its initial state.
 */
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	os_atomic_init(cond, SCHED_COND_INIT);
}
1376
/*
 * sched_cond_wait_parameter:
 *
 * Wait on the condition, resuming at 'continuation' with 'parameter'.
 * If a wakeup raced in after the wait was asserted, the wait is undone
 * and the wakeup is consumed instead of blocking.
 */
wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation,
	void *parameter)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	return thread_block_parameter(continuation, parameter);
}
1396
/*
 * sched_cond_wait:
 *
 * Thin wrapper: identical to sched_cond_wait_parameter with a NULL parameter.
 */
wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation)
{
	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
}
1405
/*
 * sched_cond_ack:
 *
 * Acknowledge a wakeup: atomically toggles the ACTIVE and WAKEUP bits.
 * Callers invoke this only when ACTIVE is clear (the assert verifies it
 * is set afterward), so the XOR re-arms ACTIVE and consumes the pending
 * WAKEUP. Acquire ordering pairs with the release in sched_cond_signal.
 */
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1414
/*
 * sched_cond_signal:
 *
 * Post a wakeup on the condition. The targeted thread is only woken
 * when this is the first unconsumed wakeup AND the waiter has cleared
 * its ACTIVE bit (i.e. it has committed to blocking). Release ordering
 * pairs with the acquire in sched_cond_ack.
 */
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t *cond,
	thread_t thread)
{
	disable_preemption();
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1429
1430 /*
1431 * thread_isoncpu:
1432 *
1433 * Return TRUE if a thread is running on a processor such that an AST
1434 * is needed to pull it out of userspace execution, or if executing in
1435 * the kernel, bring to a context switch boundary that would cause
1436 * thread state to be serialized in the thread PCB.
1437 *
1438 * Thread locked, returns the same way. While locked, fields
1439 * like "state" cannot change. "runq" can change only from set to unset.
1440 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread->runq != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}
1473
1474 /*
1475 * thread_stop:
1476 *
1477 * Force a preemption point for a thread and wait
1478 * for it to stop running on a CPU. If a stronger
1479 * guarantee is requested, wait until no longer
1480 * runnable. Arbitrates access among
1481 * multiple stop requests. (released by unstop)
1482 *
1483 * The thread must enter a wait state and stop via a
1484 * separate means.
1485 *
1486 * Returns FALSE if interrupted.
1487 */
boolean_t
thread_stop(
	thread_t thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	spl_t s = splsched();
	boolean_t oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Arbitrate with other stop requests: wait until no one else holds
	 * TH_SUSP on this thread (released by thread_unstop).
	 */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		/* Interrupted while arbitrating: give up without owning TH_SUSP. */
		if (wresult != THREAD_AWAKENED) {
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	/* We now own the stop request. */
	thread->state |= TH_SUSP;

	/*
	 * Wait for the thread to leave the CPU (and, if requested, to stop
	 * being runnable at all), kicking it with an AST while it is on core.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		/* Interrupted: release our TH_SUSP ownership before failing. */
		if (wresult != THREAD_AWAKENED) {
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1567
1568 /*
1569 * thread_unstop:
1570 *
1571 * Release a previous stop request and set
1572 * the thread running if appropriate.
1573 *
1574 * Use only after a successful stop operation.
1575 */
void
thread_unstop(
	thread_t thread)
{
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A thread must not be solely suspended with no run/wait state. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		/* Wake anyone blocked in thread_stop()/thread_wait() arbitration. */
		if (thread->wake_active) {
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1606
1607 /*
1608 * thread_wait:
1609 *
1610 * Wait for a thread to stop running. (non-interruptible)
1611 *
1612 */
void
thread_wait(
	thread_t thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	boolean_t oncpu;
	processor_t processor;
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			/* Kick the thread off its CPU so its state gets serialized. */
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		/* Uninterruptible: unlike thread_stop(), this cannot be aborted. */
		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1661
1662 /*
1663 * Routine: clear_wait_internal
1664 *
1665 * Clear the wait condition for the specified thread.
1666 * Start the thread executing if that is appropriate.
1667 * Arguments:
1668 * thread thread to awaken
1669 * result Wakeup result the thread should see
1670 * Conditions:
1671 * At splsched
1672 * the thread is locked.
1673 * Returns:
1674 * KERN_SUCCESS thread was rousted out a wait
1675 * KERN_FAILURE thread was waiting but could not be rousted
1676 * KERN_NOT_WAITING thread was not waiting
1677 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t thread,
	wait_result_t wresult)
{
	waitq_t waitq = thread->waitq;

	/* An uninterruptible wait cannot be rousted by an interrupt result. */
	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/*
	 * Check that the thread is waiting and not waking, as a waking thread
	 * has already cleared its waitq, and is destined to be go'ed, don't
	 * need to do it again.
	 */
	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
		assert(waitq_is_null(thread->waitq));
		return KERN_NOT_WAITING;
	}

	/* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	thread_go(thread, wresult, /* handoff */ false);

	return KERN_SUCCESS;
}
1708
1709
1710 /*
1711 * clear_wait:
1712 *
1713 * Clear the wait condition for the specified thread. Start the thread
1714 * executing if that is appropriate.
1715 *
1716 * parameters:
1717 * thread thread to awaken
1718 * result Wakeup result the thread should see
1719 */
1720 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1721 clear_wait(
1722 thread_t thread,
1723 wait_result_t result)
1724 {
1725 kern_return_t ret;
1726 spl_t s;
1727
1728 s = splsched();
1729 thread_lock(thread);
1730
1731 ret = clear_wait_internal(thread, result);
1732
1733 if (thread == current_thread()) {
1734 /*
1735 * The thread must be ready to wait again immediately
1736 * after clearing its own wait.
1737 */
1738 assert((thread->state & TH_WAKING) == 0);
1739 }
1740
1741 thread_unlock(thread);
1742 splx(s);
1743 return ret;
1744 }
1745
1746
1747 /*
1748 * thread_wakeup_prim:
1749 *
1750 * Common routine for thread_wakeup, thread_wakeup_with_result,
1751 * and thread_wakeup_one.
1752 *
1753 */
1754 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1755 thread_wakeup_prim(
1756 event_t event,
1757 boolean_t one_thread,
1758 wait_result_t result)
1759 {
1760 if (__improbable(event == NO_EVENT)) {
1761 panic("%s() called with NO_EVENT", __func__);
1762 }
1763
1764 struct waitq *wq = global_eventq(event);
1765
1766 if (one_thread) {
1767 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1768 } else {
1769 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1770 }
1771 }
1772
1773 /*
1774 * Wakeup a specified thread if and only if it's waiting for this event
1775 */
1776 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1777 thread_wakeup_thread(
1778 event_t event,
1779 thread_t thread)
1780 {
1781 if (__improbable(event == NO_EVENT)) {
1782 panic("%s() called with NO_EVENT", __func__);
1783 }
1784
1785 if (__improbable(thread == THREAD_NULL)) {
1786 panic("%s() called with THREAD_NULL", __func__);
1787 }
1788
1789 struct waitq *wq = global_eventq(event);
1790
1791 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1792 }
1793
1794 /*
1795 * Wakeup a thread waiting on an event and promote it to a priority.
1796 *
1797 * Requires woken thread to un-promote itself when done.
1798 */
1799 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1800 thread_wakeup_one_with_pri(
1801 event_t event,
1802 int priority)
1803 {
1804 if (__improbable(event == NO_EVENT)) {
1805 panic("%s() called with NO_EVENT", __func__);
1806 }
1807
1808 struct waitq *wq = global_eventq(event);
1809
1810 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1811 }
1812
1813 /*
1814 * Wakeup a thread waiting on an event,
1815 * promote it to a priority,
1816 * and return a reference to the woken thread.
1817 *
1818 * Requires woken thread to un-promote itself when done.
1819 */
1820 thread_t
thread_wakeup_identify(event_t event,int priority)1821 thread_wakeup_identify(event_t event,
1822 int priority)
1823 {
1824 if (__improbable(event == NO_EVENT)) {
1825 panic("%s() called with NO_EVENT", __func__);
1826 }
1827
1828 struct waitq *wq = global_eventq(event);
1829
1830 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1831 }
1832
1833 /*
1834 * thread_bind:
1835 *
1836 * Force the current thread to execute on the specified processor.
1837 * Takes effect after the next thread_block().
1838 *
1839 * Returns the previous binding. PROCESSOR_NULL means
1840 * not bound.
1841 *
1842 * XXX - DO NOT export this to users - XXX
1843 */
1844 processor_t
thread_bind(processor_t processor)1845 thread_bind(
1846 processor_t processor)
1847 {
1848 thread_t self = current_thread();
1849 processor_t prev;
1850 spl_t s;
1851
1852 s = splsched();
1853 thread_lock(self);
1854
1855 prev = thread_bind_internal(self, processor);
1856
1857 thread_unlock(self);
1858 splx(s);
1859
1860 return prev;
1861 }
1862
/*
 * thread_bind_during_wakeup:
 *
 * Bind a thread that is mid-wakeup (TH_WAIT | TH_WAKING) to a processor.
 * Conditions: interrupts disabled, thread locked (asserted under MACH_ASSERT).
 */
void
thread_bind_during_wakeup(thread_t thread, processor_t processor)
{
	assert(!ml_get_interrupts_enabled());
	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
#if MACH_ASSERT
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
#endif

	/* Skip the rebind (and its tracepoint) if already bound as requested. */
	if (thread->bound_processor != processor) {
		thread_bind_internal(thread, processor);
	}
}
1876
/*
 * thread_unbind_after_queue_shutdown:
 *
 * Unbind a thread from a processor whose runqueue is being shut down,
 * and re-enqueue it if it was removed so it can land on a valid runqueue.
 * Conditions: interrupts disabled; takes the thread lock.
 */
void
thread_unbind_after_queue_shutdown(
	thread_t thread,
	processor_t processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		assert(thread->bound_processor == processor);

		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);
		if (removed) {
			/* Re-enqueue now that the binding no longer constrains placement. */
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}
1904
1905 /*
1906 * thread_bind_internal:
1907 *
1908 * If the specified thread is not the current thread, and it is currently
1909 * running on another CPU, a remote AST must be sent to that CPU to cause
1910 * the thread to migrate to its bound processor. Otherwise, the migration
1911 * will occur at the next quantum expiration or blocking point.
1912 *
1913 * When the thread is the current thread, and explicit thread_block() should
1914 * be used to force the current processor to context switch away and
1915 * let the thread migrate to the bound processor.
1916 *
1917 * Thread must be locked, and at splsched.
1918 */
1919
/*
 * Returns the previous binding (PROCESSOR_NULL if none).
 * Thread must be locked, at splsched, and not on a runqueue.
 */
static processor_t
thread_bind_internal(
	thread_t thread,
	processor_t processor)
{
	processor_t prev;

	/* <rdar://problem/15102234> */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	assert(thread->runq == PROCESSOR_NULL);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
	    thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);

	prev = thread->bound_processor;
	thread->bound_processor = processor;

	return prev;
}
1940
1941 /*
1942 * thread_vm_bind_group_add:
1943 *
1944 * The "VM bind group" is a special mechanism to mark a collection
1945 * of threads from the VM subsystem that, in general, should be scheduled
1946 * with only one CPU of parallelism. To accomplish this, we initially
1947 * bind all the threads to the master processor, which has the effect
1948 * that only one of the threads in the group can execute at once, including
1949 * preempting threads in the group that are a lower priority. Future
1950 * mechanisms may use more dynamic mechanisms to prevent the collection
1951 * of VM threads from using more CPU time than desired.
1952 *
1953 * The current implementation can result in priority inversions where
1954 * compute-bound priority 95 or realtime threads that happen to have
1955 * landed on the master processor prevent the VM threads from running.
1956 * When this situation is detected, we unbind the threads for one
1957 * scheduler tick to allow the scheduler to run the threads an
1958 * additional CPUs, before restoring the binding (assuming high latency
1959 * is no longer a problem).
1960 */
1961
1962 /*
1963 * The current max is provisioned for:
1964 * vm_compressor_swap_trigger_thread (92)
1965 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1966 * vm_pageout_continue (92)
1967 * memorystatus_thread (95)
1968 */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the VM bind group thread list and count below. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Threads enrolled via thread_vm_bind_group_add(); entries are never removed here. */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while the group is unbound to relieve observed wakeup latency. */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1974
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	/* Reference is held on behalf of the group's thread list entry. */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1993
/*
 * sched_vm_group_maintenance:
 *
 * Periodic check of the VM bind group: detect high runnable latency on
 * the bound processor and temporarily unbind (or later rebind) the group
 * to restore progress.  See the block comment above thread_vm_bind_group_add.
 */
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	/* "High latency" = runnable since before the last sched tick interval. */
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: observe the group's current scheduling state. */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread->runq == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new binding to every thread in the group. */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2091
2092 #if defined(__x86_64__)
2093 #define SCHED_AVOID_CPU0 1
2094 #else
2095 #define SCHED_AVOID_CPU0 0
2096 #endif
2097
/* Allow realtime threads to run on SMT secondary processors. */
int sched_allow_rt_smt = 1;
/*
 * Realtime cpu0 avoidance: 0 = off, 1 = prefer cpu0 only as a backup,
 * 2 = prefer secondary CPUs as backups (see thread_select timeout logic).
 */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
/* Allow realtime threads to be stolen across processor sets. */
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Number of additional processors signalled as backups for a realtime thread. */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2104
2105 int
sched_get_rt_n_backup_processors(void)2106 sched_get_rt_n_backup_processors(void)
2107 {
2108 return sched_rt_n_backup_processors;
2109 }
2110
2111 void
sched_set_rt_n_backup_processors(int n)2112 sched_set_rt_n_backup_processors(int n)
2113 {
2114 if (n < 0) {
2115 n = 0;
2116 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2117 n = SCHED_MAX_BACKUP_PROCESSORS;
2118 }
2119
2120 sched_rt_n_backup_processors = n;
2121 }
2122
/*
 * When non-zero, a higher-priority queued RT thread always preempts a
 * lower-priority running one; otherwise the running thread may finish its
 * computation if the queued thread can still meet its deadline
 * (see thread_select).
 */
int sched_rt_runq_strict_priority = false;
2124
2125 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2126 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2127 {
2128 if (current_pset != new_pset) {
2129 pset_unlock(current_pset);
2130 pset_lock(new_pset);
2131 }
2132
2133 return new_pset;
2134 }
2135
2136 /*
2137 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2138 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2139 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2140 * IPI thrash if this core does not remain idle following the load balancing ASTs
2141 * Idle "thrash", when IPI issue is followed by idle entry/core power down
2142 * followed by a wakeup shortly thereafter.
2143 */
2144
#if (DEVELOPMENT || DEBUG)
/* Debug tunable: setting to 0 disables the rebalancing IPIs in sched_SMT_balance(). */
int sched_smt_balance = 1;
#endif
2148
/*
 * sched_SMT_balance:
 *
 * Invoked with pset locked, returns with pset unlocked.
 * Called on an idle SMT processor whose sibling is also idle: scans for a
 * secondary CPU that is running a sub-realtime thread while its primary is
 * also running (an over-committed core) and sends it a rebalance IPI so the
 * work can migrate to this idle core.  Always returns false.
 */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	/* The sibling sharing our core: our secondary, or (if we are a secondary) our primary. */
	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Secondaries currently running: their cores may be over-committed. */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Only rebalance when the primary is busy too, and never for RT threads. */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	if (ast_processor) {
		/* ast_processor is only set after ipi_type was assigned, so ipi_type is valid here. */
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2202
2203 static cpumap_t
pset_available_cpumap(processor_set_t pset)2204 pset_available_cpumap(processor_set_t pset)
2205 {
2206 return pset->cpu_available_map & pset->recommended_bitmask;
2207 }
2208
2209 int
pset_available_cpu_count(processor_set_t pset)2210 pset_available_cpu_count(processor_set_t pset)
2211 {
2212 return bit_count(pset_available_cpumap(pset));
2213 }
2214
2215 bool
pset_is_recommended(processor_set_t pset)2216 pset_is_recommended(processor_set_t pset)
2217 {
2218 if (!pset) {
2219 return false;
2220 }
2221 return pset_available_cpu_count(pset) > 0;
2222 }
2223
2224 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2225 pset_available_but_not_running_cpumap(processor_set_t pset)
2226 {
2227 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2228 pset->recommended_bitmask;
2229 }
2230
2231 bool
pset_has_stealable_threads(processor_set_t pset)2232 pset_has_stealable_threads(processor_set_t pset)
2233 {
2234 pset_assert_locked(pset);
2235
2236 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2237 /*
2238 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2239 * available primary CPUs
2240 */
2241 avail_map &= pset->primary_map;
2242
2243 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2244 }
2245
2246 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2247 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2248 {
2249 cpumap_t avail_map = pset_available_cpumap(pset);
2250 if (!sched_allow_rt_smt) {
2251 /*
2252 * Secondary CPUs are not allowed to run RT threads, so
2253 * only primary CPUs should be included
2254 */
2255 avail_map &= pset->primary_map;
2256 }
2257
2258 return avail_map & ~pset->realtime_map;
2259 }
2260
2261 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2262 pset_needs_a_followup_IPI(processor_set_t pset)
2263 {
2264 int nbackup_cpus = 0;
2265
2266 if (rt_runq_is_low_latency(pset)) {
2267 nbackup_cpus = sched_rt_n_backup_processors;
2268 }
2269
2270 int rt_rq_count = rt_runq_count(pset);
2271
2272 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2273 }
2274
2275 bool
pset_has_stealable_rt_threads(processor_set_t pset)2276 pset_has_stealable_rt_threads(processor_set_t pset)
2277 {
2278 pset_node_t node = pset->node;
2279 if (bit_count(node->pset_map) == 1) {
2280 return false;
2281 }
2282
2283 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2284
2285 return rt_runq_count(pset) > bit_count(avail_map);
2286 }
2287
2288 static void
pset_update_rt_stealable_state(processor_set_t pset)2289 pset_update_rt_stealable_state(processor_set_t pset)
2290 {
2291 if (pset_has_stealable_rt_threads(pset)) {
2292 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2293 } else {
2294 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2295 }
2296 }
2297
/*
 * clear_pending_AST_bits:
 *
 * Acknowledge any pending IPI/AST signals directed at this processor, with
 * the pset lock held.  trace_point_number identifies the call site in the
 * emitted tracepoint.
 */
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		/* Trace only on an actual 1 -> 0 transition of the urgent bit. */
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2313
2314 /*
2315 * Called with pset locked, on a processor that is committing to run a new thread
2316 * Will transition an idle or dispatching processor to running as it picks up
2317 * the first new thread from the idle thread.
2318 */
2319 static void
pset_commit_processor_to_new_thread(processor_set_t pset,processor_t processor,thread_t new_thread)2320 pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2321 {
2322 pset_assert_locked(pset);
2323
2324 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2325 assert(current_thread() == processor->idle_thread);
2326
2327 /*
2328 * Dispatching processor is now committed to running new_thread,
2329 * so change its state to PROCESSOR_RUNNING.
2330 */
2331 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2332 } else {
2333 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2334 }
2335
2336 processor_state_update_from_thread(processor, new_thread, true);
2337
2338 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2339 bit_set(pset->realtime_map, processor->cpu_id);
2340 } else {
2341 bit_clear(pset->realtime_map, processor->cpu_id);
2342 }
2343 pset_update_rt_stealable_state(pset);
2344
2345 pset_node_t node = pset->node;
2346
2347 if (bit_count(node->pset_map) == 1) {
2348 /* Node has only a single pset, so skip node pset map updates */
2349 return;
2350 }
2351
2352 cpumap_t avail_map = pset_available_cpumap(pset);
2353
2354 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2355 if ((avail_map & pset->realtime_map) == avail_map) {
2356 /* No more non-RT CPUs in this pset */
2357 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2358 }
2359 avail_map &= pset->primary_map;
2360 if ((avail_map & pset->realtime_map) == avail_map) {
2361 /* No more non-RT primary CPUs in this pset */
2362 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2363 }
2364 } else {
2365 if ((avail_map & pset->realtime_map) != avail_map) {
2366 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2367 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2368 }
2369 }
2370 avail_map &= pset->primary_map;
2371 if ((avail_map & pset->realtime_map) != avail_map) {
2372 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2373 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2374 }
2375 }
2376 }
2377 }
2378
2379 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2380 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2381 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2382 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2383 #if defined(__x86_64__)
2384 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2385 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2386 #endif
2387 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2388 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2389
2390 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2391 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2392 {
2393 pset_map_t pset_map = stealing_pset->node->pset_map;
2394
2395 bit_clear(pset_map, stealing_pset->pset_id);
2396
2397 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2398 processor_set_t nset = pset_array[pset_id];
2399
2400 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2401 return true;
2402 }
2403 }
2404
2405 return false;
2406 }
2407
2408 /*
2409 * starting_pset must be locked, but returns true if it is unlocked before return
2410 */
2411 static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset,processor_t chosen_processor,bool spill_ipi,processor_t * result_processor,sched_ipi_type_t * result_ipi_type)2412 choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2413 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2414 {
2415 bool starting_pset_is_unlocked = false;
2416 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2417 int max_pri = rt_runq_priority(starting_pset);
2418 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2419 processor_set_t pset = starting_pset;
2420 processor_t next_rt_processor = PROCESSOR_NULL;
2421 if (spill_ipi) {
2422 processor_set_t nset = next_pset(pset);
2423 assert(nset != starting_pset);
2424 pset = change_locked_pset(pset, nset);
2425 starting_pset_is_unlocked = true;
2426 }
2427 do {
2428 const bool consider_secondaries = true;
2429 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2430 if (next_rt_processor == PROCESSOR_NULL) {
2431 if (!spill_ipi) {
2432 break;
2433 }
2434 processor_set_t nset = next_pset(pset);
2435 if (nset == starting_pset) {
2436 break;
2437 }
2438 pset = change_locked_pset(pset, nset);
2439 starting_pset_is_unlocked = true;
2440 }
2441 } while (next_rt_processor == PROCESSOR_NULL);
2442 if (next_rt_processor) {
2443 if (pset != starting_pset) {
2444 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2445 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2446 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2447 }
2448 }
2449 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2450 *result_processor = next_rt_processor;
2451 }
2452 if (pset != starting_pset) {
2453 pset_unlock(pset);
2454 }
2455
2456 return starting_pset_is_unlocked;
2457 }
2458
2459 /*
2460 * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2461 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2462 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2463 */
2464 typedef enum {
2465 none,
2466 backup,
2467 followup,
2468 spill
2469 } next_processor_type_t;
2470
2471 #undef LOOP_COUNT
2472 #ifdef LOOP_COUNT
2473 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2474 #endif
2475
2476 /*
2477 * thread_select:
2478 *
2479 * Select a new thread for the current processor to execute.
2480 *
2481 * May select the current thread, which must be locked.
2482 */
2483 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2484 thread_select(thread_t thread,
2485 processor_t processor,
2486 ast_t *reason)
2487 {
2488 processor_set_t pset = processor->processor_set;
2489 thread_t new_thread = THREAD_NULL;
2490
2491 assert(processor == current_processor());
2492 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2493
2494 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2495 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2496
2497 __kdebug_only int idle_reason = 0;
2498 __kdebug_only int delay_count = 0;
2499
2500 #if defined(__x86_64__)
2501 int timeout_count = sched_backup_cpu_timeout_count;
2502 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2503 /* Prefer cpu0 as backup */
2504 timeout_count--;
2505 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2506 /* Prefer secondary cpu as backup */
2507 timeout_count--;
2508 }
2509 #endif
2510 bool pending_AST_URGENT = false;
2511 bool pending_AST_PREEMPT = false;
2512
2513 #ifdef LOOP_COUNT
2514 int loop_count = -1;
2515 #endif
2516
2517 do {
2518 /*
2519 * Update the priority.
2520 */
2521 if (SCHED(can_update_priority)(thread)) {
2522 SCHED(update_priority)(thread);
2523 }
2524
2525 pset_lock(pset);
2526
2527 restart:
2528 #ifdef LOOP_COUNT
2529 loop_count++;
2530 if (loop_count > max_loop_count[processor->cpu_id]) {
2531 max_loop_count[processor->cpu_id] = loop_count;
2532 if (bit_count(loop_count) == 1) {
2533 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2534 }
2535 }
2536 #endif
2537 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2538 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2539
2540 processor_state_update_from_thread(processor, thread, true);
2541
2542 idle_reason = 0;
2543
2544 processor_t ast_processor = PROCESSOR_NULL;
2545 processor_t next_rt_processor = PROCESSOR_NULL;
2546 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2547 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2548
2549 assert(processor->state != PROCESSOR_OFF_LINE);
2550
2551 /*
2552 * Bound threads are dispatched to a processor without going through
2553 * choose_processor(), so in those cases we must continue trying to dequeue work
2554 * as we are the only option.
2555 */
2556 if (!SCHED(processor_bound_count)(processor)) {
2557 if (!processor->is_recommended) {
2558 /*
2559 * The performance controller has provided a hint to not dispatch more threads,
2560 */
2561 idle_reason = 1;
2562 goto send_followup_ipi_before_idle;
2563 } else if (rt_runq_count(pset)) {
2564 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2565 /* Give the current RT thread a chance to complete */
2566 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2567 #if defined(__x86_64__)
2568 /*
2569 * On Intel we want to avoid SMT secondary processors and processor 0
2570 * but allow them to be used as backup processors in case the preferred chosen
2571 * processor is delayed by interrupts or processor stalls. So if it is
2572 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2573 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2574 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2575 * to grab the thread before the (current) backup processor does.
2576 *
2577 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2578 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2579 * cpu0 before secondary cpus or not.
2580 */
2581 if (!ok_to_run_realtime_thread) {
2582 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2583 if (timeout_count-- > 0) {
2584 pset_unlock(pset);
2585 thread_unlock(thread);
2586 delay(10);
2587 delay_count++;
2588 thread_lock(thread);
2589 pset_lock(pset);
2590 goto restart;
2591 }
2592 ok_to_run_realtime_thread = true;
2593 }
2594 }
2595 #endif
2596 if (!ok_to_run_realtime_thread) {
2597 idle_reason = 2;
2598 goto send_followup_ipi_before_idle;
2599 }
2600 } else if (processor->processor_primary != processor) {
2601 /*
2602 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2603 * we should look for work only under the same conditions that choose_processor()
2604 * would have assigned work, which is when all primary processors have been assigned work.
2605 */
2606 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2607 /* There are idle primaries */
2608 idle_reason = 3;
2609 goto idle;
2610 }
2611 }
2612 }
2613
2614 /*
2615 * Test to see if the current thread should continue
2616 * to run on this processor. Must not be attempting to wait, and not
2617 * bound to a different processor, nor be in the wrong
2618 * processor set, nor be forced to context switch by TH_SUSP.
2619 *
2620 * Note that there are never any RT threads in the regular runqueue.
2621 *
2622 * This code is very insanely tricky.
2623 */
2624
2625 /* i.e. not waiting, not TH_SUSP'ed */
2626 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2627
2628 /*
2629 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2630 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2631 * <rdar://problem/47907700>
2632 *
2633 * A yielding thread shouldn't be forced to context switch.
2634 */
2635
2636 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2637
2638 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2639
2640 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2641
2642 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2643
2644 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2645
2646 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2647
2648 bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2649 if (current_thread_can_keep_running) {
2650 /*
2651 * This thread is eligible to keep running on this processor.
2652 *
2653 * RT threads with un-expired quantum stay on processor,
2654 * unless there's a valid RT thread with an earlier deadline
2655 * and it is still ok_to_run_realtime_thread.
2656 */
2657 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2658 /*
2659 * Pick a new RT thread only if ok_to_run_realtime_thread
2660 * (but the current thread is allowed to complete).
2661 */
2662 if (ok_to_run_realtime_thread) {
2663 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2664 goto pick_new_rt_thread;
2665 }
2666 if (rt_runq_priority(pset) > thread->sched_pri) {
2667 if (sched_rt_runq_strict_priority) {
2668 /* The next RT thread is better, so pick it off the runqueue. */
2669 goto pick_new_rt_thread;
2670 }
2671
2672 /*
2673 * See if the current lower priority thread can continue to run without causing
2674 * the higher priority thread on the runq queue to miss its deadline.
2675 */
2676 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2677 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2678 /* The next RT thread is better, so pick it off the runqueue. */
2679 goto pick_new_rt_thread;
2680 }
2681 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2682 /* The next RT thread is better, so pick it off the runqueue. */
2683 goto pick_new_rt_thread;
2684 }
2685 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2686 goto pick_new_rt_thread;
2687 }
2688 }
2689
2690 /* This is still the best RT thread to run. */
2691 processor->deadline = thread->realtime.deadline;
2692
2693 sched_update_pset_load_average(pset, 0);
2694
2695 clear_pending_AST_bits(pset, processor, 1);
2696
2697 next_rt_processor = PROCESSOR_NULL;
2698 next_rt_ipi_type = SCHED_IPI_NONE;
2699
2700 bool pset_unlocked = false;
2701 __kdebug_only next_processor_type_t nptype = none;
2702 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2703 nptype = spill;
2704 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2705 } else if (pset_needs_a_followup_IPI(pset)) {
2706 nptype = followup;
2707 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2708 }
2709 if (!pset_unlocked) {
2710 pset_unlock(pset);
2711 }
2712
2713 if (next_rt_processor) {
2714 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2715 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2716 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2717 }
2718
2719 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2720 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2721 return thread;
2722 }
2723
2724 if ((rt_runq_count(pset) == 0) &&
2725 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2726 /* This thread is still the highest priority runnable (non-idle) thread */
2727 processor->deadline = RT_DEADLINE_NONE;
2728
2729 sched_update_pset_load_average(pset, 0);
2730
2731 clear_pending_AST_bits(pset, processor, 2);
2732
2733 pset_unlock(pset);
2734
2735 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2736 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2737 return thread;
2738 }
2739 } else {
2740 /*
2741 * This processor must context switch.
2742 * If it's due to a rebalance, we should aggressively find this thread a new home.
2743 */
2744 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2745 *reason |= AST_REBALANCE;
2746 }
2747 }
2748
2749 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2750 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2751 (processor->processor_secondary->state == PROCESSOR_IDLE));
2752
2753 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2754 if (ok_to_run_realtime_thread) {
2755 pick_new_rt_thread:
2756 new_thread = sched_rt_choose_thread(pset);
2757 if (new_thread != THREAD_NULL) {
2758 processor->deadline = new_thread->realtime.deadline;
2759 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2760
2761 clear_pending_AST_bits(pset, processor, 3);
2762
2763 if (processor->processor_secondary != NULL) {
2764 processor_t sprocessor = processor->processor_secondary;
2765 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2766 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2767 ast_processor = sprocessor;
2768 }
2769 }
2770 }
2771 }
2772
2773 send_followup_ipi_before_idle:
2774 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2775 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2776 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2777 }
2778 __kdebug_only next_processor_type_t nptype = none;
2779 bool pset_unlocked = false;
2780 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2781 nptype = spill;
2782 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2783 } else if (pset_needs_a_followup_IPI(pset)) {
2784 nptype = followup;
2785 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2786 }
2787
2788 assert(new_thread || !ast_processor);
2789 if (new_thread || next_rt_processor) {
2790 if (!pset_unlocked) {
2791 pset_unlock(pset);
2792 pset_unlocked = true;
2793 }
2794 if (ast_processor == next_rt_processor) {
2795 ast_processor = PROCESSOR_NULL;
2796 ipi_type = SCHED_IPI_NONE;
2797 }
2798
2799 if (ast_processor) {
2800 sched_ipi_perform(ast_processor, ipi_type);
2801 }
2802
2803 if (next_rt_processor) {
2804 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2805 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2806 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2807 }
2808
2809 if (new_thread) {
2810 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2811 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2812 return new_thread;
2813 }
2814 }
2815
2816 if (pset_unlocked) {
2817 pset_lock(pset);
2818 }
2819
2820 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2821 /* Things changed while we dropped the lock */
2822 goto restart;
2823 }
2824
2825 if (processor->is_recommended) {
2826 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2827 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2828 /* Things changed while we dropped the lock */
2829 goto restart;
2830 }
2831
2832 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2833 /* secondary can only run realtime thread */
2834 if (idle_reason == 0) {
2835 idle_reason = 4;
2836 }
2837 goto idle;
2838 }
2839 } else if (!SCHED(processor_bound_count)(processor)) {
2840 /* processor not recommended and no bound threads */
2841 if (idle_reason == 0) {
2842 idle_reason = 5;
2843 }
2844 goto idle;
2845 }
2846
2847 processor->deadline = RT_DEADLINE_NONE;
2848
2849 /* No RT threads, so let's look at the regular threads. */
2850 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2851 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2852
2853 clear_pending_AST_bits(pset, processor, 4);
2854
2855 ast_processor = PROCESSOR_NULL;
2856 ipi_type = SCHED_IPI_NONE;
2857
2858 processor_t sprocessor = processor->processor_secondary;
2859 if (sprocessor != NULL) {
2860 if (sprocessor->state == PROCESSOR_RUNNING) {
2861 if (thread_no_smt(new_thread)) {
2862 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2863 ast_processor = sprocessor;
2864 }
2865 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2866 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2867 ast_processor = sprocessor;
2868 }
2869 }
2870 pset_unlock(pset);
2871
2872 if (ast_processor) {
2873 sched_ipi_perform(ast_processor, ipi_type);
2874 }
2875 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2876 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2877 return new_thread;
2878 }
2879
2880 if (processor->must_idle) {
2881 processor->must_idle = false;
2882 *reason |= AST_REBALANCE;
2883 idle_reason = 6;
2884 goto idle;
2885 }
2886
2887 if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2888 /*
2889 * No runnable threads, attempt to steal
2890 * from other processors. Returns with pset lock dropped.
2891 */
2892
2893 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2894 pset_lock(pset);
2895 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2896 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2897 /*
2898 * A realtime thread choose this processor while it was DISPATCHING
2899 * and the pset lock was dropped
2900 */
2901 ast_on(AST_URGENT | AST_PREEMPT);
2902 }
2903
2904 clear_pending_AST_bits(pset, processor, 5);
2905
2906 pset_unlock(pset);
2907
2908 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2909 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2910 return new_thread;
2911 }
2912
2913 /*
2914 * If other threads have appeared, shortcut
2915 * around again.
2916 */
2917 if (SCHED(processor_bound_count)(processor)) {
2918 continue;
2919 }
2920 if (processor->is_recommended) {
2921 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2922 continue;
2923 }
2924 }
2925
2926 pset_lock(pset);
2927 }
2928
2929 idle:
2930 /* Someone selected this processor while we had dropped the lock */
2931 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2932 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2933 goto restart;
2934 }
2935
2936 if ((idle_reason == 0) && current_thread_can_keep_running) {
2937 /* This thread is the only runnable (non-idle) thread */
2938 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2939 processor->deadline = thread->realtime.deadline;
2940 } else {
2941 processor->deadline = RT_DEADLINE_NONE;
2942 }
2943
2944 sched_update_pset_load_average(pset, 0);
2945
2946 clear_pending_AST_bits(pset, processor, 6);
2947
2948 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2949 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2950 pset_unlock(pset);
2951 return thread;
2952 }
2953
2954 /*
2955 * Nothing is runnable, or this processor must be forced idle,
2956 * so set this processor idle if it was running.
2957 */
2958 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2959 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2960 processor_state_update_idle(processor);
2961 }
2962 pset_update_rt_stealable_state(pset);
2963
2964 clear_pending_AST_bits(pset, processor, 7);
2965
2966 /* Invoked with pset locked, returns with pset unlocked */
2967 processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2968
2969 new_thread = processor->idle_thread;
2970 } while (new_thread == THREAD_NULL);
2971
2972 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2973 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2974 return new_thread;
2975 }
2976
/*
 * thread_invoke
 *
 * Called at splsched with neither thread locked.
 *
 * Perform a context switch and start executing the new thread.
 *
 * Returns FALSE when the context switch didn't happen.
 * The reference to the new thread is still consumed.
 *
 * "self" is what is currently running on the processor,
 * "thread" is the new thread to context switch to
 * (which may be the same thread in some cases)
 */
static boolean_t
thread_invoke(
	thread_t self,
	thread_t thread,
	ast_t reason)
{
	/*
	 * Context switching with preemption disabled is fatal: a negative
	 * level suggests an unbalanced unlock, a positive one means we are
	 * blocking while holding a spinlock or in interrupt context.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	/* Snapshot the current time/usage once; 'ctime' is reused throughout. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Run the periodic timeshare maintenance check, except when switching
	 * to the idle thread or when a realtime thread is doing a direct
	 * handoff.
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime, true);
	}
#endif

	recount_log_switch_thread(&snap);

	assert_thread_magic(self);
	assert(self == current_thread());
	assert(self->runq == PROCESSOR_NULL);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	assert(thread->runq == PROCESSOR_NULL);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	/*
	 * 'self' is parking in a continuation, so its register state does not
	 * need to be saved: attempt a stack handoff rather than a full
	 * context save.
	 */
	if (continuation != NULL) {
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required. However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/* No stack and we can't hand one off: queue the thread
			 * for the stack allocator and report no switch. */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with no continuation: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required. We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* Previous snap on the old stack is gone. */
	recount_log_switch_thread_on(NULL);

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3317
3318 #if defined(CONFIG_SCHED_DEFERRED_AST)
/*
 * pset_cancel_deferred_dispatch:
 *
 * Cancels all ASTs that we can cancel for the given processor set
 * if the current processor is running the last runnable thread in the
 * system.
 *
 * This function assumes the current thread is runnable.  This must
 * be called with the pset unlocked.
 */
static void
pset_cancel_deferred_dispatch(
	processor_set_t pset,
	processor_t processor)
{
	processor_t active_processor = NULL;
	uint32_t sampled_sched_run_count;

	pset_lock(pset);
	/* Sample the run count once; it is updated atomically, not pset-guarded. */
	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * If we have emptied the run queue, and our current thread is runnable, we
	 * should tell any processors that are still DISPATCHING that they will
	 * probably not have any work to do.  In the event that there are no
	 * pending signals that we can cancel, this is also uninteresting.
	 *
	 * In the unlikely event that another thread becomes runnable while we are
	 * doing this (sched_run_count is atomically updated, not guarded), the
	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
	 * in order to dispatch it to a processor in our pset.  So, the other
	 * codepath will wait while we squash all cancelable ASTs, get the pset
	 * lock, and then dispatch the freshly runnable thread.  So this should be
	 * correct (we won't accidentally have a runnable thread that hasn't been
	 * dispatched to an idle processor), if not ideal (we may be restarting the
	 * dispatch process, which could have some overhead).
	 */

	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * CPUs that are DISPATCHING purely because of a deferred AST,
		 * with no urgent (non-cancelable) signal pending.
		 */
		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);
		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
			active_processor = processor_array[cpuid];
			/*
			 * If a processor is DISPATCHING, it could be because of
			 * a cancelable signal.
			 *
			 * IF the processor is not our
			 * current processor (the current processor should not
			 * be DISPATCHING, so this is a bit paranoid), AND there
			 * is a cancelable signal pending on the processor, AND
			 * there is no non-cancelable signal pending (as there is
			 * no point trying to backtrack on bringing the processor
			 * up if a signal we cannot cancel is outstanding), THEN
			 * it should make sense to roll back the processor state
			 * to the IDLE state.
			 *
			 * If the racey nature of this approach (as the signal
			 * will be arbitrated by hardware, and can fire as we
			 * roll back state) results in the core responding
			 * despite being pushed back to the IDLE state, it
			 * should be no different than if the core took some
			 * interrupt while IDLE.
			 */
			if (active_processor != processor) {
				/*
				 * Squash all of the processor state back to some
				 * reasonable facsimile of PROCESSOR_IDLE.
				 */

				processor_state_update_idle(active_processor);
				active_processor->deadline = RT_DEADLINE_NONE;
				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
				machine_signal_idle_cancel(active_processor);
			}
		}
	}

	pset_unlock(pset);
}
3401 #else
3402 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3403 #endif
3404
3405 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3406 thread_csw_callout(
3407 thread_t old,
3408 thread_t new,
3409 uint64_t timestamp)
3410 {
3411 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3412 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3413 machine_switch_perfcontrol_context(event, timestamp, 0,
3414 same_pri_latency, old, new);
3415 }
3416
3417
/*
 * thread_dispatch:
 *
 * Handle threads at context switch. Re-dispatch other thread
 * if still running, otherwise update run state and perform
 * special actions. Update quantum for other thread and begin
 * the quantum for ourselves.
 *
 * "thread" is the old thread that we have switched away from.
 * "self" is the new current thread that we have context switched to
 *
 * Called at splsched.
 *
 */
void
thread_dispatch(
	thread_t thread,
	thread_t self)
{
	processor_t processor = self->last_processor;
	/* True when the outgoing thread was the idle thread; used below to
	 * re-arm kperf's running timer. */
	bool was_idle = false;

	assert(processor == current_processor());
	assert(self == current_thread_volatile());
	assert(thread != self);

	if (thread != THREAD_NULL) {
		/*
		 * Do the perfcontrol callout for context switch.
		 * The reason we do this here is:
		 * - thread_dispatch() is called from various places that are not
		 *   the direct context switch path for eg. processor shutdown etc.
		 *   So adding the callout here covers all those cases.
		 * - We want this callout as early as possible to be close
		 *   to the timestamp taken in thread_invoke()
		 * - We want to avoid holding the thread lock while doing the
		 *   callout
		 * - We do not want to callout if "thread" is NULL.
		 */
		thread_csw_callout(thread, self, processor->last_dispatch);

#if KASAN
		if (thread->continuation != NULL) {
			/*
			 * Thread has a continuation and the normal stack is going away.
			 * Unpoison the stack and mark all fakestack objects as unused.
			 */
#if KASAN_CLASSIC
			kasan_fakestack_drop(thread);
#endif /* KASAN_CLASSIC */
			if (thread->kernel_stack) {
				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
			}
		}


#if KASAN_CLASSIC
		/*
		 * Free all unused fakestack objects.
		 */
		kasan_fakestack_gc(thread);
#endif /* KASAN_CLASSIC */
#endif /* KASAN */

		/*
		 * If blocked at a continuation, discard
		 * the stack.
		 */
		if (thread->continuation != NULL && thread->kernel_stack != 0) {
			stack_free(thread);
		}

		if (thread->state & TH_IDLE) {
			was_idle = true;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
			    (uintptr_t)thread_tid(thread), 0, thread->state,
			    sched_run_buckets[TH_BUCKET_RUN], 0);
		} else {
			int64_t consumed;
			int64_t remainder = 0;

			/* Unused portion of the outgoing thread's quantum. */
			if (processor->quantum_end > processor->last_dispatch) {
				remainder = processor->quantum_end -
				    processor->last_dispatch;
			}

			consumed = thread->quantum_remaining - remainder;

			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the task and
				 * the individual thread.
				 */
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit_thread(thread, thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
				if (thread->t_bankledger) {
					ledger_credit_thread(thread, thread->t_bankledger,
					    bank_ledgers.cpu_time,
					    (consumed - thread->t_deduct_bank_ledger_time));
				}
				thread->t_deduct_bank_ledger_time = 0;
				if (consumed > 0) {
					/*
					 * This should never be negative, but in traces we are seeing some instances
					 * of consumed being negative.
					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
					 */
					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
				}
			}

			/* For the thread that we just context switched away from, figure
			 * out if we have expired the wq quantum and set the AST if we have
			 */
			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
				thread_evaluate_workqueue_quantum_expiry(thread);
			}

			if (__improbable(thread->rwlock_count != 0)) {
				smr_mark_active_trackers_stalled(thread);
			}

			/*
			 * Pairs with task_restartable_ranges_synchronize
			 */
			wake_lock(thread);
			thread_lock(thread);

			/*
			 * Same as ast_check(), in case we missed the IPI
			 */
			thread_reset_pcs_ack_IPI(thread);

			/*
			 * Apply a priority floor if the thread holds a kernel resource
			 * or explicitly requested it.
			 * Do this before checking starting_pri to avoid overpenalizing
			 * repeated rwlock blockers.
			 */
			if (__improbable(thread->rwlock_count != 0)) {
				lck_rw_set_promotion_locked(thread);
			}
			if (__improbable(thread->priority_floor_count != 0)) {
				thread_floor_boost_set_promotion_locked(thread);
			}

			boolean_t keep_quantum = processor->first_timeslice;

			/*
			 * Treat a thread which has dropped priority since it got on core
			 * as having expired its quantum.
			 */
			if (processor->starting_pri > thread->sched_pri) {
				keep_quantum = FALSE;
			}

			/* Compute remainder of current quantum. */
			if (keep_quantum &&
			    processor->quantum_end > processor->last_dispatch) {
				thread->quantum_remaining = (uint32_t)remainder;
			} else {
				thread->quantum_remaining = 0;
			}

			if (thread->sched_mode == TH_MODE_REALTIME) {
				/*
				 * Cancel the deadline if the thread has
				 * consumed the entire quantum.
				 */
				if (thread->quantum_remaining == 0) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
				}
			} else {
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
				/*
				 * For non-realtime threads treat a tiny
				 * remaining quantum as an expired quantum
				 * but include what's left next time.
				 */
				if (thread->quantum_remaining < min_std_quantum) {
					thread->reason |= AST_QUANTUM;
					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
				}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
			}

			/*
			 * If we are doing a direct handoff then
			 * take the remainder of the quantum.
			 */
			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
				self->quantum_remaining = thread->quantum_remaining;
				thread->reason |= AST_QUANTUM;
				thread->quantum_remaining = 0;
			} else {
#if defined(CONFIG_SCHED_MULTIQ)
				if (SCHED(sched_groups_enabled) &&
				    thread->sched_group == self->sched_group) {
					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
					    self->reason, (uintptr_t)thread_tid(thread),
					    self->quantum_remaining, thread->quantum_remaining, 0);

					self->quantum_remaining = thread->quantum_remaining;
					thread->quantum_remaining = 0;
					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
				}
#endif /* defined(CONFIG_SCHED_MULTIQ) */
			}

			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);

			if (!(thread->state & TH_WAIT)) {
				/*
				 * Still runnable.
				 */
				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;

				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);

				ast_t reason = thread->reason;
				sched_options_t options = SCHED_NONE;

				if (reason & AST_REBALANCE) {
					options |= SCHED_REBALANCE;
					if (reason & AST_QUANTUM) {
						/*
						 * Having gone to the trouble of forcing this thread off a less preferred core,
						 * we should force the preferable core to reschedule immediately to give this
						 * thread a chance to run instead of just sitting on the run queue where
						 * it may just be stolen back by the idle core we just forced it off.
						 * But only do this at the end of a quantum to prevent cascading effects.
						 */
						options |= SCHED_PREEMPT;
					}
				}

				if (reason & AST_QUANTUM) {
					options |= SCHED_TAILQ;
				} else if (reason & AST_PREEMPT) {
					options |= SCHED_HEADQ;
				} else {
					options |= (SCHED_PREEMPT | SCHED_TAILQ);
				}

				thread_setrun(thread, options);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
				    sched_run_buckets[TH_BUCKET_RUN], 0);

				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);
			} else {
				/*
				 * Waiting.
				 */
				boolean_t should_terminate = FALSE;
				uint32_t new_run_count;
				int thread_state = thread->state;

				/* Only the first call to thread_dispatch
				 * after explicit termination should add
				 * the thread to the termination queue
				 */
				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
					should_terminate = TRUE;
					thread_state |= TH_TERMINATE2;
				}

				timer_stop(&thread->runnable_timer, processor->last_dispatch);

				thread_state &= ~TH_RUN;
				thread->state = thread_state;

				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
				thread->chosen_processor = PROCESSOR_NULL;

				new_run_count = SCHED(run_count_decr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
					work_interval_auto_join_unwind(thread);
				}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_SFI
				if (thread->reason & AST_SFI) {
					thread->wait_sfi_begin_time = processor->last_dispatch;
				}
#endif
				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
				    new_run_count, 0);

				if (thread_state & TH_WAIT_REPORT) {
					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
				}

				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);

				if (should_terminate) {
					thread_terminate_enqueue(thread);
				}
			}
		}
		/*
		 * The thread could have been added to the termination queue, so it's
		 * unsafe to use after this point.
		 */
		thread = THREAD_NULL;
	}

	int urgency = THREAD_URGENCY_NONE;
	uint64_t latency = 0;

	/* Update (new) current thread and reprogram running timers */
	thread_lock(self);

	if (!(self->state & TH_IDLE)) {
		uint64_t arg1, arg2;

#if CONFIG_SCHED_SFI
		ast_t new_ast;

		new_ast = sfi_thread_needs_ast(self, NULL);

		if (new_ast != AST_NONE) {
			ast_on(new_ast);
		}
#endif

		if (processor->last_dispatch < self->last_made_runnable_time) {
			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
			    processor->last_dispatch, self->last_made_runnable_time);
		}

		assert(self->last_made_runnable_time <= self->last_basepri_change_time);

		latency = processor->last_dispatch - self->last_made_runnable_time;
		assert(latency >= self->same_pri_latency);

		urgency = thread_get_urgency(self, &arg1, &arg2);

		thread_tell_urgency(urgency, arg1, arg2, latency, self);

		/*
		 * Start a new CPU limit interval if the previous one has
		 * expired. This should happen before initializing a new
		 * quantum.
		 */
		if (cpulimit_affects_quantum &&
		    thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
			thread_cpulimit_restart(processor->last_dispatch);
		}

		/*
		 * Get a new quantum if none remaining.
		 */
		if (self->quantum_remaining == 0) {
			thread_quantum_init(self, processor->last_dispatch);
		}

		/*
		 * Set up quantum timer and timeslice.
		 */
		processor->quantum_end = processor->last_dispatch +
		    self->quantum_remaining;

		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
		    processor->quantum_end, processor->last_dispatch);
		if (was_idle) {
			/*
			 * kperf's running timer is active whenever the idle thread for a
			 * CPU is not running.
			 */
			kperf_running_setup(processor, processor->last_dispatch);
		}
		running_timers_activate(processor);
		processor->first_timeslice = TRUE;
	} else {
		running_timers_deactivate(processor);
		processor->first_timeslice = FALSE;
		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	}

	assert(self->block_hint == kThreadWaitNone);
	self->computation_epoch = processor->last_dispatch;
	/*
	 * This relies on the interrupt time being tallied up to the thread in the
	 * exception handler epilogue, which is before AST context where preemption
	 * is considered (and the scheduler is potentially invoked to
	 * context switch, here).
	 */
	self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
	self->reason = AST_NONE;
	processor->starting_pri = self->sched_pri;

	thread_unlock(self);

	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	    processor->last_dispatch);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	/*
	 * TODO: Can we state that redispatching our old thread is also
	 * uninteresting?
	 */
	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
		pset_cancel_deferred_dispatch(processor->processor_set, processor);
	}
#endif
}
3857
3858 /*
3859 * thread_block_reason:
3860 *
3861 * Forces a reschedule, blocking the caller if a wait
3862 * has been asserted.
3863 *
3864 * If a continuation is specified, then thread_invoke will
3865 * attempt to discard the thread's kernel stack. When the
3866 * thread resumes, it will execute the continuation function
3867 * on a new kernel stack.
3868 */
wait_result_t
thread_block_reason(
	thread_continue_t continuation,
	void *parameter,
	ast_t reason)
{
	thread_t self = current_thread();
	processor_t processor;
	thread_t new_thread;
	spl_t s;

	/* Raise to scheduler interrupt level for the duration of the switch. */
	s = splsched();

	processor = current_processor();

	/* If we're explicitly yielding, force a subsequent quantum */
	if (reason & AST_YIELD) {
		processor->first_timeslice = FALSE;
	}

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	clear_pending_nonurgent_preemption(processor);

#if PROC_REF_DEBUG
	/*
	 * NOTE(review): presumably this guards against carrying proc refs
	 * across a continuation (where the stack is discarded) — confirm.
	 */
	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
	}
#endif

	/* Record where to resume if the kernel stack is discarded. */
	self->continuation = continuation;
	self->parameter = parameter;

	/* Trace only when actually waiting/stopping, not a bare preemption. */
	if (self->state & ~(TH_RUN | TH_IDLE)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	}

	/*
	 * Select a new thread and try to switch to it; thread_invoke() can
	 * fail (returning FALSE), in which case select again until a switch
	 * succeeds.
	 */
	do {
		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	} while (!thread_invoke(self, new_thread, reason));

	splx(s);

	return self->wait_result;
}
3919
3920 /*
3921 * thread_block:
3922 *
3923 * Block the current thread if a wait has been asserted.
3924 */
3925 wait_result_t
thread_block(thread_continue_t continuation)3926 thread_block(
3927 thread_continue_t continuation)
3928 {
3929 return thread_block_reason(continuation, NULL, AST_NONE);
3930 }
3931
3932 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3933 thread_block_parameter(
3934 thread_continue_t continuation,
3935 void *parameter)
3936 {
3937 return thread_block_reason(continuation, parameter, AST_NONE);
3938 }
3939
3940 /*
3941 * thread_run:
3942 *
3943 * Switch directly from the current thread to the
3944 * new thread, handing off our quantum if appropriate.
3945 *
3946 * New thread must be runnable, and not on a run queue.
3947 *
3948 * Called at splsched.
3949 */
int
thread_run(
	thread_t self,
	thread_continue_t continuation,
	void *parameter,
	thread_t new_thread)
{
	ast_t reason = AST_NONE;

	/* Hand off our remaining quantum only when we're not the idle thread. */
	if ((self->state & TH_IDLE) == 0) {
		reason = AST_HANDOFF;
	}

	/*
	 * If this thread hadn't been setrun'ed, it
	 * might not have a chosen processor, so give it one
	 */
	if (new_thread->chosen_processor == NULL) {
		new_thread->chosen_processor = current_processor();
	}

	/* Record where to resume if the kernel stack is discarded. */
	self->continuation = continuation;
	self->parameter = parameter;

	while (!thread_invoke(self, new_thread, reason)) {
		/* the handoff failed, so we have to fall back to the normal block path */
		processor_t processor = current_processor();

		/* Drop the handoff intent; select a thread the normal way. */
		reason = AST_NONE;

		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	}

	return self->wait_result;
}
3987
3988 /*
3989 * thread_continue:
3990 *
3991 * Called at splsched when a thread first receives
3992 * a new stack after a continuation.
3993 *
3994 * Called with THREAD_NULL as the old thread when
3995 * invoked by machine_load_context.
3996 */
void
thread_continue(
	thread_t thread)
{
	thread_t self = current_thread();
	thread_continue_t continuation;
	void *parameter;

	DTRACE_SCHED(on__cpu);

	/* Capture the continuation recorded before the old stack was discarded. */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif

	/* Finish dispatching the thread we switched away from (may be NULL). */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if KASAN_TBI
	/* Fresh stack for the continuation: clear stale KASAN shadow state. */
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN_TBI */


	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
4042
4043 void
thread_quantum_init(thread_t thread,uint64_t now)4044 thread_quantum_init(thread_t thread, uint64_t now)
4045 {
4046 uint64_t new_quantum = 0;
4047
4048 switch (thread->sched_mode) {
4049 case TH_MODE_REALTIME:
4050 new_quantum = thread->realtime.computation;
4051 new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4052 break;
4053
4054 case TH_MODE_FIXED:
4055 new_quantum = SCHED(initial_quantum_size)(thread);
4056 new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4057 break;
4058
4059 default:
4060 new_quantum = SCHED(initial_quantum_size)(thread);
4061 break;
4062 }
4063
4064 if (cpulimit_affects_quantum) {
4065 const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4066
4067 /*
4068 * If there's no remaining CPU time, the ledger system will
4069 * notice and put the thread to sleep.
4070 */
4071 if (cpulimit_remaining > 0) {
4072 new_quantum = MIN(new_quantum, cpulimit_remaining);
4073 }
4074 }
4075
4076 assert3u(new_quantum, <, UINT32_MAX);
4077 assert3u(new_quantum, >, 0);
4078
4079 thread->quantum_remaining = (uint32_t)new_quantum;
4080 }
4081
4082 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)4083 sched_timeshare_initial_quantum_size(thread_t thread)
4084 {
4085 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4086 return bg_quantum;
4087 } else {
4088 return std_quantum;
4089 }
4090 }
4091
4092 /*
4093 * run_queue_init:
4094 *
4095 * Initialize a run queue before first use.
4096 */
4097 void
run_queue_init(run_queue_t rq)4098 run_queue_init(
4099 run_queue_t rq)
4100 {
4101 rq->highq = NOPRI;
4102 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4103 rq->bitmap[i] = 0;
4104 }
4105 rq->urgency = rq->count = 0;
4106 for (int i = 0; i < NRQS; i++) {
4107 circle_queue_init(&rq->queues[i]);
4108 }
4109 }
4110
4111 /*
4112 * run_queue_dequeue:
4113 *
4114 * Perform a dequeue operation on a run queue,
4115 * and return the resulting thread.
4116 *
4117 * The run queue must be locked (see thread_run_queue_remove()
4118 * for more info), and not empty.
4119 */
4120 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)4121 run_queue_dequeue(
4122 run_queue_t rq,
4123 sched_options_t options)
4124 {
4125 thread_t thread;
4126 circle_queue_t queue = &rq->queues[rq->highq];
4127
4128 if (options & SCHED_HEADQ) {
4129 thread = cqe_dequeue_head(queue, struct thread, runq_links);
4130 } else {
4131 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4132 }
4133
4134 assert(thread != THREAD_NULL);
4135 assert_thread_magic(thread);
4136
4137 thread->runq = PROCESSOR_NULL;
4138 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4139 rq->count--;
4140 if (SCHED(priority_is_urgent)(rq->highq)) {
4141 rq->urgency--; assert(rq->urgency >= 0);
4142 }
4143 if (circle_queue_empty(queue)) {
4144 bitmap_clear(rq->bitmap, rq->highq);
4145 rq->highq = bitmap_first(rq->bitmap, NRQS);
4146 }
4147
4148 return thread;
4149 }
4150
4151 /*
4152 * run_queue_enqueue:
4153 *
 * Perform an enqueue operation on a run queue.
4155 *
4156 * The run queue must be locked (see thread_run_queue_remove()
4157 * for more info).
4158 */
4159 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4160 run_queue_enqueue(
4161 run_queue_t rq,
4162 thread_t thread,
4163 sched_options_t options)
4164 {
4165 circle_queue_t queue = &rq->queues[thread->sched_pri];
4166 boolean_t result = FALSE;
4167
4168 assert_thread_magic(thread);
4169
4170 if (circle_queue_empty(queue)) {
4171 circle_enqueue_tail(queue, &thread->runq_links);
4172
4173 rq_bitmap_set(rq->bitmap, thread->sched_pri);
4174 if (thread->sched_pri > rq->highq) {
4175 rq->highq = thread->sched_pri;
4176 result = TRUE;
4177 }
4178 } else {
4179 if (options & SCHED_TAILQ) {
4180 circle_enqueue_tail(queue, &thread->runq_links);
4181 } else {
4182 circle_enqueue_head(queue, &thread->runq_links);
4183 }
4184 }
4185 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4186 rq->urgency++;
4187 }
4188 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4189 rq->count++;
4190
4191 return result;
4192 }
4193
4194 /*
4195 * run_queue_remove:
4196 *
4197 * Remove a specific thread from a runqueue.
4198 *
4199 * The run queue must be locked.
4200 */
4201 void
run_queue_remove(run_queue_t rq,thread_t thread)4202 run_queue_remove(
4203 run_queue_t rq,
4204 thread_t thread)
4205 {
4206 circle_queue_t queue = &rq->queues[thread->sched_pri];
4207
4208 assert(thread->runq != PROCESSOR_NULL);
4209 assert_thread_magic(thread);
4210
4211 circle_dequeue(queue, &thread->runq_links);
4212 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4213 rq->count--;
4214 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4215 rq->urgency--; assert(rq->urgency >= 0);
4216 }
4217
4218 if (circle_queue_empty(queue)) {
4219 /* update run queue status */
4220 bitmap_clear(rq->bitmap, thread->sched_pri);
4221 rq->highq = bitmap_first(rq->bitmap, NRQS);
4222 }
4223
4224 thread->runq = PROCESSOR_NULL;
4225 }
4226
4227 /*
4228 * run_queue_peek
4229 *
4230 * Peek at the runq and return the highest
4231 * priority thread from the runq.
4232 *
4233 * The run queue must be locked.
4234 */
4235 thread_t
run_queue_peek(run_queue_t rq)4236 run_queue_peek(
4237 run_queue_t rq)
4238 {
4239 if (rq->count > 0) {
4240 circle_queue_t queue = &rq->queues[rq->highq];
4241 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4242 assert_thread_magic(thread);
4243 return thread;
4244 } else {
4245 return THREAD_NULL;
4246 }
4247 }
4248
/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into the deadline-ordered queue for its
 * priority level, updating the per-level and queue-wide cached
 * earliest-deadline/constraint values.
 *
 * Returns true if the thread became the head of its priority level
 * (i.e. the caller should consider preemption).
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	/* Mark this priority level occupied. */
	bitmap_set(map, i);

	queue_t queue = &rt_runq->pri_queue;
	uint64_t deadline = thread->realtime.deadline;
	bool preempt = false;
	bool earliest = false;

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				if (iter == queue_first(queue)) {
					/* New head of this level: refresh its deadline cache. */
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				/* Latest deadline of all: append at the tail. */
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Propagate a new overall-earliest deadline to the queue-wide cache. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	thread->runq = processor;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4309
/*
 * rt_runq_dequeue:
 *
 * Remove and return the next realtime thread to run. Normally this is
 * the head of the highest occupied priority level; when strict priority
 * is disabled, the earliest-deadline thread may be chosen instead if
 * both computations still fit within the higher-priority thread's
 * constraint. Recomputes the queue-wide earliest-deadline caches.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		/* EDF override: consider the earliest-deadline level instead. */
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Cache the new head's deadline for this priority level. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Level drained: mark it unoccupied. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across all levels. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	new_thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4373
4374 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4375 rt_runq_first(rt_queue_t rt_run_queue)
4376 {
4377 bitmap_t *map = rt_run_queue->bitmap;
4378 int i = bitmap_first(map, NRTQS);
4379 if (i < 0) {
4380 return THREAD_NULL;
4381 }
4382 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4383 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4384
4385 return next_rt;
4386 }
4387
/*
 * rt_runq_remove:
 *
 * Unlink a specific thread from the realtime run queue and rebuild the
 * per-level and queue-wide earliest-deadline/constraint caches, exactly
 * mirroring the bookkeeping in rt_runq_dequeue().
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Cache the new head's deadline for this priority level. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Level drained: mark it unoccupied. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across all levels. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4433
4434 rt_queue_t
sched_rtlocal_runq(processor_set_t pset)4435 sched_rtlocal_runq(processor_set_t pset)
4436 {
4437 return &pset->rt_runq;
4438 }
4439
4440 void
sched_rtlocal_init(processor_set_t pset)4441 sched_rtlocal_init(processor_set_t pset)
4442 {
4443 pset_rt_init(pset);
4444 }
4445
/*
 * sched_rtlocal_queue_shutdown:
 *
 * Drain a pset's realtime run queue when its processor shuts down,
 * re-dispatching the drained threads so they can be placed elsewhere.
 * Only acts when no available CPUs remain in the pset.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t thread;
	queue_head_t tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	/* Collect threads on a local queue so setrun can happen unlocked. */
	queue_init(&tqueue);

	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	/* Re-dispatch each thread; thread_setrun picks a new destination. */
	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4481
4482 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
/*
 * sched_rtlocal_runq_scan:
 *
 * Walk every pset's realtime run queue and record the oldest
 * last-made-runnable timestamp into the scan context. Takes each
 * pset lock in turn under splsched.
 */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t thread;

	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Visit every occupied priority level of this pset's RT queue. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4514
4515 int64_t
sched_rtlocal_runq_count_sum(void)4516 sched_rtlocal_runq_count_sum(void)
4517 {
4518 pset_node_t node = &pset_node0;
4519 processor_set_t pset = node->psets;
4520 int64_t count = 0;
4521
4522 do {
4523 while (pset != NULL) {
4524 count += pset->rt_runq.runq_stats.count_sum;
4525
4526 pset = pset->pset_list;
4527 }
4528 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4529
4530 return count;
4531 }
4532
/*
 * sched_rtlocal_steal_thread:
 *
 * Try to steal a realtime thread with a deadline earlier (by more than
 * rt_deadline_epsilon) than the given one from some other pset on this
 * node.
 *
 * Called with stealing_pset locked and
 * returns with stealing_pset locked
 * but the lock will have been dropped
 * if a thread is returned.
 */
thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	if (!sched_allow_rt_steal) {
		return THREAD_NULL;
	}
	pset_map_t pset_map = stealing_pset->node->pset_map;

	/* Never try to steal from ourselves. */
	bit_clear(pset_map, stealing_pset->pset_id);

	processor_set_t pset = stealing_pset;

	processor_set_t target_pset;
	uint64_t target_deadline;

retry:
	target_pset = NULL;
	target_deadline = earliest_deadline - rt_deadline_epsilon;

	/* Find the pset advertising the earliest stealable deadline. */
	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		/*
		 * During startup, while pset_array[] and node->pset_map are still being initialized,
		 * the update to pset_map may become visible to this cpu before the update to pset_array[].
		 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
		 * so just check nset is not NULL instead.
		 */
		if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
			target_deadline = nset->stealable_rt_threads_earliest_deadline;
			target_pset = nset;
		}
	}

	if (target_pset != NULL) {
		/* Swap locks over to the target; its state may have changed meanwhile. */
		pset = change_locked_pset(pset, target_pset);
		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
			pset_update_rt_stealable_state(pset);
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

			pset = change_locked_pset(pset, stealing_pset);
			return new_thread;
		}
		/* Target no longer has a better deadline; re-evaluate and retry. */
		pset = change_locked_pset(pset, stealing_pset);
		earliest_deadline = rt_runq_earliest_deadline(pset);
		goto retry;
	}

	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
4591
/*
 * sched_rt_choose_thread:
 *
 * Choose the next realtime thread for this processor, preferring a
 * stolen earliest-deadline thread (when stealing is enabled) over the
 * local realtime run queue. Clears this CPU's pending-spill bit as the
 * spill request is consumed.
 *
 * pset is locked
 */
thread_t
sched_rt_choose_thread(processor_set_t pset)
{
	processor_t processor = current_processor();

	if (SCHED(steal_thread_enabled)(pset)) {
		/* Keep trying while a spill request remains pending for this CPU. */
		do {
			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (spill_pending) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
			}
			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
			if (new_thread != THREAD_NULL) {
				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
				}
				return new_thread;
			}
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}

	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
	}

	/* Fall back to the local realtime run queue. */
	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}
4629
4630 /*
4631 * realtime_queue_insert:
4632 *
4633 * Enqueue a thread for realtime execution.
4634 */
4635 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4636 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4637 {
4638 pset_assert_locked(pset);
4639
4640 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4641 pset_update_rt_stealable_state(pset);
4642
4643 return preempt;
4644 }
4645
4646 /*
4647 * realtime_setrun:
4648 *
4649 * Dispatch a thread for realtime execution.
4650 *
4651 * Thread must be locked. Associated pset must
4652 * be locked, and is returned unlocked.
4653 */
/*
 * realtime_setrun:
 *
 * Dispatch a thread for realtime execution: enqueue it on the chosen
 * processor's pset, decide whether the chosen processor must preempt,
 * and optionally signal additional "backup" processors (for tight
 * constraints) so one of them can pick the thread up quickly.
 *
 * Thread must be locked. Associated pset must
 * be locked, and is returned unlocked.
 */
static void
realtime_setrun(
	processor_t chosen_processor,
	thread_t thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	bool pset_is_locked = true;

	int n_backup = 0;

	/* Tight-constraint threads get backup processors signalled as well. */
	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/*
	 * CPUs already poked with an urgent AST beyond the queued RT thread
	 * count are effectively backups in flight; don't over-signal.
	 */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	/* Slot 0 is the chosen processor; slots 1..n_backup are backups. */
	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			/*
			 * Preempt when the new thread outranks the processor's
			 * current priority, or ties it with a meaningfully
			 * earlier deadline.
			 */
			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* Self-preemption: set ASTs and pending masks directly. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already dispatching: just record the urgent AST. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					if (processor == current_processor()) {
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slots: pick another RT-capable processor to IPI. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			/* choose_next... returns true if it dropped the pset lock. */
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/*
			 * NOTE(review): `backup` is not declared anywhere in this
			 * function's visible scope — confirm it resolves (e.g. to a
			 * file-scope symbol) or whether this tracepoint argument is
			 * a defect.
			 */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if defined(__x86_64__)
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Send all IPIs after the pset lock has been dropped. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4793
4794
/*
 * sched_ipi_deferred_policy:
 *
 * Decide whether a deferred IPI may be used for the destination CPU.
 * Returns SCHED_IPI_DEFERRED when no deferred AST is already pending
 * for that CPU, SCHED_IPI_IMMEDIATE for thread groups that require it,
 * and SCHED_IPI_NONE otherwise. Panics on platforms without deferred
 * AST support.
 */
sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
    thread_t thread, __unused sched_ipi_event_t event)
{
#if defined(CONFIG_SCHED_DEFERRED_AST)
#if CONFIG_THREAD_GROUPS
	/* Some thread groups opt out of deferral entirely. */
	if (thread) {
		struct thread_group *tg = thread_group_get(thread);
		if (thread_group_uses_immediate_ipi(tg)) {
			return SCHED_IPI_IMMEDIATE;
		}
	}
#endif /* CONFIG_THREAD_GROUPS */
	/* Only defer if no deferred AST is already pending for this CPU. */
	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
		return SCHED_IPI_DEFERRED;
	}
#else /* CONFIG_SCHED_DEFERRED_AST */
	(void) thread;
	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
#endif /* CONFIG_SCHED_DEFERRED_AST */
	return SCHED_IPI_NONE;
}
4817
/*
 * sched_ipi_action:
 *
 * Select the IPI type needed to notify processor <dst> about <event> on
 * behalf of <thread> (may be NULL), and record the corresponding pending-AST
 * state in the destination's pset bitmasks.  The actual signal is sent later
 * via sched_ipi_perform(), typically after the pset lock is dropped.
 *
 * Returns SCHED_IPI_NONE when no cross-CPU signal is required (e.g. the
 * destination is the current processor).
 *
 * NOTE(review): the pset bitmasks are modified without taking the pset lock
 * here, so callers presumably hold it — confirm against call sites.
 */
sched_ipi_type_t
sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
{
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	assert(dst != NULL);

	processor_set_t pset = dst->processor_set;
	/* The local processor can just take an AST; no IPI needed. */
	if (current_processor() == dst) {
		return SCHED_IPI_NONE;
	}

	/*
	 * Claim an idle destination by moving it to DISPATCHING before
	 * consulting the policy, so no one else also tries to wake it.
	 */
	bool dst_idle = (dst->state == PROCESSOR_IDLE);
	if (dst_idle) {
		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
	}

	/* Defer the actual policy decision to the active scheduler. */
	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	switch (ipi_type) {
	case SCHED_IPI_NONE:
		return SCHED_IPI_NONE;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	case SCHED_IPI_DEFERRED:
		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
		break;
#endif /* CONFIG_SCHED_DEFERRED_AST */
	default:
		/*
		 * Immediate/idle IPI: mark both the urgent and preempt pending
		 * masks; the tracepoint fires only on the 0 -> 1 transition.
		 */
		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
		}
		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
		break;
	}
	return ipi_type;
}
4853
4854 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4855 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4856 {
4857 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4858 boolean_t deferred_ipi_supported = false;
4859 processor_set_t pset = dst->processor_set;
4860
4861 #if defined(CONFIG_SCHED_DEFERRED_AST)
4862 deferred_ipi_supported = true;
4863 #endif /* CONFIG_SCHED_DEFERRED_AST */
4864
4865 switch (event) {
4866 case SCHED_IPI_EVENT_SPILL:
4867 case SCHED_IPI_EVENT_SMT_REBAL:
4868 case SCHED_IPI_EVENT_REBALANCE:
4869 case SCHED_IPI_EVENT_BOUND_THR:
4870 case SCHED_IPI_EVENT_RT_PREEMPT:
4871 /*
4872 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4873 * scenarios use immediate IPIs always.
4874 */
4875 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4876 break;
4877 case SCHED_IPI_EVENT_PREEMPT:
4878 /* In the preemption case, use immediate IPIs for RT threads */
4879 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4880 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4881 break;
4882 }
4883
4884 /*
4885 * For Non-RT threads preemption,
4886 * If the core is active, use immediate IPIs.
4887 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4888 */
4889 if (deferred_ipi_supported && dst_idle) {
4890 return sched_ipi_deferred_policy(pset, dst, thread, event);
4891 }
4892 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4893 break;
4894 default:
4895 panic("Unrecognized scheduler IPI event type %d", event);
4896 }
4897 assert(ipi_type != SCHED_IPI_NONE);
4898 return ipi_type;
4899 }
4900
4901 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4902 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4903 {
4904 switch (ipi) {
4905 case SCHED_IPI_NONE:
4906 break;
4907 case SCHED_IPI_IDLE:
4908 machine_signal_idle(dst);
4909 break;
4910 case SCHED_IPI_IMMEDIATE:
4911 cause_ast_check(dst);
4912 break;
4913 case SCHED_IPI_DEFERRED:
4914 machine_signal_idle_deferred(dst);
4915 break;
4916 default:
4917 panic("Unrecognized scheduler IPI type: %d", ipi);
4918 }
4919 }
4920
4921 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4922
4923 boolean_t
priority_is_urgent(int priority)4924 priority_is_urgent(int priority)
4925 {
4926 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4927 }
4928
4929 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4930
4931 /*
4932 * processor_setrun:
4933 *
4934 * Dispatch a thread for execution on a
4935 * processor.
4936 *
4937 * Thread must be locked. Associated pset must
4938 * be locked, and is returned unlocked.
4939 */
static void
processor_setrun(
	processor_t processor,
	thread_t thread,
	integer_t options)
{
	processor_set_t pset = processor->processor_set;
	pset_assert_locked(pset);
	ast_t preempt = AST_NONE;
	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	thread->chosen_processor = processor;

	/*
	 * Set preemption mode.
	 * Urgent-priority threads and eagerly-preemptible current threads get
	 * AST_URGENT; demoted timeshare threads (sched_pri < base_pri) normally
	 * do not force preemption unless their base priority is urgent.
	 */
#if defined(CONFIG_SCHED_DEFERRED_AST)
	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
#endif
	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if (processor->current_is_eagerpreempt) {
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else {
		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	}

	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
		/*
		 * Having gone to the trouble of forcing this thread off a less preferred core,
		 * we should force the preferable core to reschedule immediately to give this
		 * thread a chance to run instead of just sitting on the run queue where
		 * it may just be stolen back by the idle core we just forced it off.
		 */
		preempt |= AST_PREEMPT;
	}

	/* Enqueue the thread, then update the pset load statistics. */
	SCHED(processor_enqueue)(processor, thread, options);
	sched_update_pset_load_average(pset, 0);

	if (preempt != AST_NONE) {
		/*
		 * Decide how to poke the target based on its current state:
		 * IDLE must be woken, DISPATCHING only needs the pending-AST bit
		 * (it will check before running), RUNNING/SHUTDOWN is interrupted
		 * only if the new thread outranks what is running there.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
			}
		} else if ((processor->state == PROCESSOR_RUNNING ||
		    processor->state == PROCESSOR_SHUTDOWN) &&
		    (thread->sched_pri >= processor->current_pri)) {
			ipi_action = eInterruptRunning;
		}
	} else {
		/*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling
		 */
		if (processor->state == PROCESSOR_SHUTDOWN &&
		    thread->sched_pri >= processor->current_pri) {
			ipi_action = eInterruptRunning;
		} else if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
			}
		}
	}

	if (ipi_action != eDoNothing) {
		if (processor == current_processor()) {
			/*
			 * Local target: no IPI needed.  Re-evaluate the preemption
			 * decision under the pset lock and mirror the result into the
			 * pending-AST bitmasks so they stay consistent with the ASTs
			 * actually latched on this CPU.
			 */
			if (ipi_action == eExitIdle) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
			}
			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}

			if ((preempt & AST_URGENT) == AST_URGENT) {
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
				}
			} else {
				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
				}
			}

			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			} else {
				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			}
		} else {
			/* Remote target: pick an IPI type (also records pending-AST bits). */
			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
			ipi_type = sched_ipi_action(processor, thread, event);
		}
	}

	/* Drop the pset lock before actually sending the IPI. */
	pset_unlock(pset);
	sched_ipi_perform(processor, ipi_type);

	if (ipi_action != eDoNothing && processor == current_processor()) {
		/* Arm any non-urgent preemption timer state for the local CPU. */
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}
5058
5059 /*
5060 * choose_next_pset:
5061 *
5062 * Return the next sibling pset containing
5063 * available processors.
5064 *
5065 * Returns the original pset if none other is
5066 * suitable.
5067 */
5068 static processor_set_t
choose_next_pset(processor_set_t pset)5069 choose_next_pset(
5070 processor_set_t pset)
5071 {
5072 processor_set_t nset = pset;
5073
5074 do {
5075 nset = next_pset(nset);
5076
5077 /*
5078 * Sometimes during startup the pset_map can contain a bit
5079 * for a pset that isn't fully published in pset_array because
5080 * the pset_map read isn't an acquire load.
5081 *
5082 * In order to avoid needing an acquire barrier here, just bail
5083 * out.
5084 */
5085 if (nset == PROCESSOR_SET_NULL) {
5086 return pset;
5087 }
5088 } while (nset->online_processor_count < 1 && nset != pset);
5089
5090 return nset;
5091 }
5092
5093 /*
5094 * choose_processor:
5095 *
5096 * Choose a processor for the thread, beginning at
5097 * the pset. Accepts an optional processor hint in
5098 * the pset.
5099 *
5100 * Returns a processor, possibly from a different pset.
5101 *
5102 * The thread must be locked. The pset must be locked,
5103 * and the resulting pset is locked on return.
5104 */
processor_t
choose_processor(
	processor_set_t starting_pset,
	processor_t processor,
	thread_t thread)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * Prefer the hinted processor, when appropriate.
	 */

	/* Fold last processor hint from secondary processor to its primary */
	if (processor != PROCESSOR_NULL) {
		processor = processor->processor_primary;
	}

	/*
	 * Only consult platform layer if pset is active, which
	 * it may not be in some cases when a multi-set system
	 * is going to sleep.
	 */
	if (pset->online_processor_count) {
		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
			processor_t mc_processor = machine_choose_processor(pset, processor);
			if (mc_processor != PROCESSOR_NULL) {
				processor = mc_processor->processor_primary;
			}
		}
	}

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_SHUTDOWN:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
					uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
					/*
					 * Only 1 idle core, choose it.
					 */
					if (bit_count(idle_primary_map) == 1) {
						return processor;
					}

					/*
					 * If the rotation bitmask to force a migration is set for this core and one of the preferred cores
					 * is idle, don't continue running on the same core.
					 */
					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && preferred_idle_primary_map != 0)) {
						return processor;
					}
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 *
	 * A primary/secondary pair of SMT processors are
	 * "unpaired" if the primary is busy but its
	 * corresponding secondary is idle (so the physical
	 * core has full use of its resources).
	 */

	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_secondary_priority = MAXPRI + 1;
	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		/* Seed the "lowest priority" / "lowest runq count" candidates with the hint. */
		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime placement: sweep all psets in the node, first primaries
		 * only, then (second pass) secondaries too.  The pset lock migrates
		 * with the sweep via change_locked_pset().
		 */
		pset_node_t node = pset->node;
		bool include_ast_urgent_pending_cpus = false;
		cpumap_t ast_urgent_pending;
try_again:
		ast_urgent_pending = 0;
		/*
		 * Start at 1 (consider secondaries immediately) when there is no SMT,
		 * only one pset, no RT-free primaries, or we are retrying to include
		 * CPUs with urgent ASTs pending.
		 */
		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
		for (; consider_secondaries < 2; consider_secondaries++) {
			pset = change_locked_pset(pset, starting_pset);
			do {
				cpumap_t available_map = pset_available_cpumap(pset);
				if (available_map == 0) {
					goto no_available_cpus;
				}

				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
				if (processor) {
					return processor;
				}

				if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
					if (processor) {
						/*
						 * Instead of looping through all the psets to find the global
						 * furthest deadline processor, preempt the first candidate found.
						 * The preempted thread will then find any other available far deadline
						 * processors to preempt.
						 */
						return processor;
					}

					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;

					/* Track the pset with the shortest RT run queue as a fallback. */
					if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
						assert(cpuid >= 0);
						lc_processor = processor_array[cpuid];
						lowest_count = rt_runq_count(pset);
					}
				}

no_available_cpus:
				nset = next_pset(pset);

				if (nset != starting_pset) {
					pset = change_locked_pset(pset, nset);
				}
			} while (nset != starting_pset);
		}

		/* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
			if (lc_processor) {
				pset_assert_locked(lc_processor->processor_set);
				return lc_processor;
			}
		} else {
			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
				include_ast_urgent_pending_cpus = true;
				goto try_again;
			}
		}

		processor = lc_processor;

		if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
			/* Check that chosen processor is still usable */
			cpumap_t available_map = pset_available_cpumap(pset);
			if (bit_test(available_map, processor->cpu_id)) {
				return processor;
			}

			/* processor is no longer usable */
			processor = PROCESSOR_NULL;
		}

		pset_assert_locked(pset);
		pset_unlock(pset);
		return PROCESSOR_NULL;
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */
		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
		uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		/*
		 * Look at the preferred cores first.
		 * Rotate through them starting after the last one chosen so that
		 * successive placements spread across the preferred set.
		 */
		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_primary_map);
		}
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			pset->cpu_preferred_last_chosen = cpuid;
			return processor;
		}

		/*
		 * Fall back to all idle cores if none of the preferred ones are available.
		 */
		cpuid = lsb_first(idle_primary_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate so the scan starts just after the last chosen CPU. */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			processor_t primary = processor->processor_primary;
			if (primary != processor) {
				/* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
					if (cpri < lowest_secondary_priority) {
						lowest_secondary_priority = cpri;
						lp_paired_secondary_processor = processor;
					}
				}
			} else {
				if (cpri < lowest_priority) {
					lowest_priority = cpri;
					lp_processor = processor;
				}
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
		 * the idle primary would have short-circuited the loop above
		 */
		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    ~pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);

		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
			processor = processor_array[cpuid];

			processor_t cprimary = processor->processor_primary;

			integer_t primary_pri = cprimary->current_pri;

			/*
			 * TODO: This should also make the same decisions
			 * as secondary_can_run_realtime_thread
			 *
			 * TODO: Keep track of the pending preemption priority
			 * of the primary to make this more accurate.
			 */

			/* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
				continue;
			}

			/*
			 * Find the idle secondary processor with the lowest priority primary
			 *
			 * We will choose this processor as a fallback if we find no better
			 * primary to preempt.
			 */
			if (primary_pri < lowest_idle_secondary_priority) {
				lp_idle_secondary_processor = processor;
				lowest_idle_secondary_priority = primary_pri;
			}

			/* Find the the lowest priority active primary with idle secondary */
			if (primary_pri < lowest_unpaired_primary_priority) {
				/* If the primary processor is offline or starting up, it's not a candidate for this path */
				if (cprimary->state != PROCESSOR_RUNNING &&
				    cprimary->state != PROCESSOR_DISPATCHING) {
					continue;
				}

				if (!cprimary->is_recommended) {
					continue;
				}

				/* if the primary is pending preemption, don't try to re-preempt it */
				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				lowest_unpaired_primary_priority = primary_pri;
				lp_unpaired_primary_processor = cprimary;
			}
		}

		/*
		 * We prefer preempting a primary processor over waking up its secondary.
		 * The secondary will then be woken up by the preempted thread.
		 */
		if (thread->sched_pri > lowest_unpaired_primary_priority) {
			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
			return lp_unpaired_primary_processor;
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuted.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/*
	 * Make sure that we pick a running processor,
	 * and that the correct processor set is locked.
	 * Since we may have unlocked the candidate processor's
	 * pset, it may have changed state.
	 *
	 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the least priority
	 * primary, or the least busy primary.
	 */

	/* lowest_priority is evaluated in the main loops above */
	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
		processor = lp_idle_secondary_processor;
	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
		processor = lp_paired_secondary_processor;
	} else if (lc_processor != PROCESSOR_NULL) {
		processor = lc_processor;
	} else {
		processor = PROCESSOR_NULL;
	}

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5576
5577 /*
5578 * Default implementation of SCHED(choose_node)()
5579 * for single node systems
5580 */
pset_node_t
sched_choose_node(__unused thread_t thread)
{
	/* Single-node systems always place threads on node 0. */
	return &pset_node0;
}
5586
5587 /*
5588 * choose_starting_pset:
5589 *
5590 * Choose a starting processor set for the thread.
5591 * May return a processor hint within the pset.
5592 *
5593 * Returns a starting processor set, to be used by
5594 * choose_processor.
5595 *
5596 * The thread must be locked. The resulting pset is unlocked on return,
5597 * and is chosen without taking any pset locks.
5598 */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 * Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 * No Affinity case:
		 *
		 * Utilitize a per task hint to spread threads
		 * among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime: prefer a pset whose primaries are free of RT load.
		 * When avoiding cpu0, rotate the map by one so pset 0 is
		 * considered last (the +1 below undoes the rotation).
		 */
		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
		/* Second choice: any pset (primary or secondary) not saturated with RT. */
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-RT: steer toward a pset that currently has an idle CPU. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* The processor hint is only meaningful if it belongs to the chosen pset. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
5717
5718 /*
5719 * thread_setrun:
5720 *
5721 * Dispatch thread for execution, onto an idle
5722 * processor or run queue, and signal a preemption
5723 * as appropriate.
5724 *
5725 * Thread must be locked.
5726 */
void
thread_setrun(
	thread_t        thread,
	sched_options_t options)
{
	processor_t processor = PROCESSOR_NULL;
	processor_set_t pset;

	/* Thread must be runnable (TH_RUN only) and not already enqueued anywhere. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->runq == PROCESSOR_NULL);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 * Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary. A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			/* choose_processor() may move to (and return with) a different locked pset. */
			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
			if (processor != PROCESSOR_NULL) {
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely,
		 * choose the master_processor, which is always
		 * safe to choose.
		 */
		if (processor == PROCESSOR_NULL) {
			/* Choose fallback processor */
			processor = master_processor;
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
		}
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 * Bound case:
		 *
		 * Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 * Dispatch the thread on the chosen processor.
	 * TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		/* Possibly spill the thread to another pset for load balancing. */
		SCHED(check_spill)(pset, thread);
	}
}
5829
5830 processor_set_t
task_choose_pset(task_t task)5831 task_choose_pset(
5832 task_t task)
5833 {
5834 processor_set_t pset = task->pset_hint;
5835
5836 if (pset != PROCESSOR_SET_NULL) {
5837 pset = choose_next_pset(pset);
5838 }
5839
5840 return pset;
5841 }
5842
5843 /*
5844 * Check for a preemption point in
5845 * the current context.
5846 *
5847 * Called at splsched with thread locked.
5848 */
ast_t
csw_check(
	thread_t        thread,
	processor_t     processor,
	ast_t           check_reason)
{
	processor_set_t pset = processor->processor_set;

	/* Preemption checks only make sense against the thread currently on core. */
	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Publish the on-core thread's current scheduling state to the pset. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	/* May arm the nonurgent-preemption timer or upgrade the AST to urgent. */
	return update_pending_nonurgent_preemption(processor, preempt);
}
5881
5882 void
clear_pending_nonurgent_preemption(processor_t processor)5883 clear_pending_nonurgent_preemption(processor_t processor)
5884 {
5885 if (!processor->pending_nonurgent_preemption) {
5886 return;
5887 }
5888
5889 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5890
5891 processor->pending_nonurgent_preemption = false;
5892 running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5893 }
5894
5895 ast_t
update_pending_nonurgent_preemption(processor_t processor,ast_t reason)5896 update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
5897 {
5898 if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
5899 clear_pending_nonurgent_preemption(processor);
5900 return reason;
5901 }
5902
5903 if (nonurgent_preemption_timer_abs == 0) {
5904 /* Preemption timer not enabled */
5905 return reason;
5906 }
5907
5908 if (current_thread()->state & TH_IDLE) {
5909 /* idle threads don't need nonurgent preemption */
5910 return reason;
5911 }
5912
5913 if (processor->pending_nonurgent_preemption) {
5914 /* Timer is already armed, no need to do it again */
5915 return reason;
5916 }
5917
5918 if (ml_did_interrupt_userspace()) {
5919 /*
5920 * We're preempting userspace here, so we don't need
5921 * to defer the preemption. Force AST_URGENT
5922 * so that we can avoid arming this timer without risking
5923 * ast_taken_user deciding to spend too long in kernel
5924 * space to handle other ASTs.
5925 */
5926
5927 return reason | AST_URGENT;
5928 }
5929
5930 /*
5931 * We've decided to do a nonurgent preemption when running in
5932 * kernelspace. We defer the preemption until reaching userspace boundary
5933 * to give a grace period for locks etc to be dropped and to reach
5934 * a clean preemption point, so that the preempting thread doesn't
5935 * always immediately hit the lock that the waking thread still holds.
5936 *
5937 * Arm a timer to enforce that the preemption executes within a bounded
5938 * time if the thread doesn't block or return to userspace quickly.
5939 */
5940
5941 processor->pending_nonurgent_preemption = true;
5942 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
5943 reason);
5944
5945 uint64_t now = mach_absolute_time();
5946
5947 uint64_t deadline = now + nonurgent_preemption_timer_abs;
5948
5949 running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
5950 now, deadline);
5951
5952 return reason;
5953 }
5954
5955 /*
5956 * Check for preemption at splsched with
5957 * pset and thread locked
5958 */
ast_t
csw_check_locked(
	thread_t        thread,
	processor_t     processor,
	processor_set_t pset,
	ast_t           check_reason)
{
	/*
	 * If the current thread is running on a processor that is no longer recommended,
	 * urgently preempt it, at which point thread_select() should
	 * try to idle the processor and re-dispatch the thread to a recommended processor.
	 */
	if (!processor->is_recommended) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/* An RT thread is waiting to be spilled to this CPU: preempt urgently. */
	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	if (rt_runq_count(pset) > 0) {
		/*
		 * Waiting RT work preempts urgently when it outranks the on-core
		 * thread, the quantum has expired, or an earlier deadline
		 * (beyond the epsilon slop) is queued; otherwise nonurgently.
		 */
		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else {
			return check_reason | AST_PREEMPT;
		}
	}

	/* Ask the scheduler policy whether its run queues warrant a preemption. */
	ast_t result = SCHED(processor_csw_check)(processor);
	if (result != AST_NONE) {
		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
	}

	/*
	 * Same for avoid-processor
	 *
	 * TODO: Should these set AST_REBALANCE?
	 */
	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
		return check_reason | AST_PREEMPT;
	}

	/*
	 * Even though we could continue executing on this processor, a
	 * secondary SMT core should try to shed load to another primary core.
	 *
	 * TODO: Should this do the same check that thread_select does? i.e.
	 * if no bound threads target this processor, and idle primaries exist, preempt
	 * The case of RT threads existing is already taken care of above
	 */

	if (processor->current_pri < BASEPRI_RTQUEUES &&
	    processor->processor_primary != processor) {
		return check_reason | AST_PREEMPT;
	}

	/* A suspended thread must get off core so the suspension can take effect. */
	if (thread->state & TH_SUSP) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SFI
	/*
	 * Current thread may not need to be preempted, but maybe needs
	 * an SFI wait?
	 */
	result = sfi_thread_needs_ast(thread, NULL);
	if (result != AST_NONE) {
		return result;
	}
#endif

	return AST_NONE;
}
6034
6035 /*
6036 * Handle preemption IPI or IPI in response to setting an AST flag
6037 * Triggered by cause_ast_check
6038 * Called at splsched
6039 */
void
ast_check(processor_t processor)
{
	/* Acknowledge any SMR cross-CPU IPI first. */
	smr_ack_ipi();

	/* Only meaningful while the processor is executing a thread. */
	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		return;
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
}
6110
6111
6112 void
thread_preempt_expire(timer_call_param_t p0,__unused timer_call_param_t p1)6113 thread_preempt_expire(
6114 timer_call_param_t p0,
6115 __unused timer_call_param_t p1)
6116 {
6117 processor_t processor = p0;
6118
6119 assert(processor == current_processor());
6120 assert(p1 == NULL);
6121
6122 thread_t thread = current_thread();
6123
6124 /*
6125 * This is set and cleared by the current core, so we will
6126 * never see a race with running timer expiration
6127 */
6128 assert(processor->pending_nonurgent_preemption);
6129
6130 clear_pending_nonurgent_preemption(processor);
6131
6132 thread_lock(thread);
6133
6134 /*
6135 * Check again to see if it's still worth a
6136 * context switch, but this time force enable kernel preemption
6137 */
6138
6139 ast_t preempt = csw_check(thread, processor, AST_URGENT);
6140
6141 if (preempt) {
6142 ast_on(preempt);
6143 }
6144
6145 thread_unlock(thread);
6146
6147 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
6148 }
6149
6150
6151 /*
6152 * set_sched_pri:
6153 *
6154 * Set the scheduled priority of the specified thread.
6155 *
6156 * This may cause the thread to change queues.
6157 *
6158 * Thread must be locked.
6159 */
void
set_sched_pri(
	thread_t        thread,
	int16_t         new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* The running thread is never on a run queue. */
		assert(thread->state & TH_RUN);
		assert(thread->runq == PROCESSOR_NULL);
	} else {
		/* Pull the thread off its run queue (if any) before repositioning it. */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Requeue at the new priority. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/* Runnable-but-remote: poke the core it's running on to re-check preemption. */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
6271
6272 /*
6273 * thread_run_queue_remove_for_handoff
6274 *
6275 * Pull a thread or its (recursive) push target out of the runqueue
6276 * so that it is ready for thread_run()
6277 *
6278 * Called at splsched
6279 *
6280 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6281 * This may be different than the thread that was passed in.
6282 */
6283 thread_t
thread_run_queue_remove_for_handoff(thread_t thread)6284 thread_run_queue_remove_for_handoff(thread_t thread)
6285 {
6286 thread_t pulled_thread = THREAD_NULL;
6287
6288 thread_lock(thread);
6289
6290 /*
6291 * Check that the thread is not bound to a different processor,
6292 * NO_SMT flag is not set on the thread, cluster type of
6293 * processor matches with thread if the thread is pinned to a
6294 * particular cluster and that realtime is not involved.
6295 *
6296 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
6297 */
6298 processor_t processor = current_processor();
6299 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6300 && (!thread_no_smt(thread))
6301 && (processor->current_pri < BASEPRI_RTQUEUES)
6302 && (thread->sched_pri < BASEPRI_RTQUEUES)
6303 #if __AMP__
6304 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6305 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6306 #endif /* __AMP__ */
6307 ) {
6308 if (thread_run_queue_remove(thread)) {
6309 pulled_thread = thread;
6310 }
6311 }
6312
6313 thread_unlock(thread);
6314
6315 return pulled_thread;
6316 }
6317
6318 /*
6319 * thread_prepare_for_handoff
6320 *
6321 * Make the thread ready for handoff.
6322 * If the thread was runnable then pull it off the runq, if the thread could
6323 * not be pulled, return NULL.
6324 *
6325 * If the thread was woken up from wait for handoff, make sure it is not bound to
6326 * different processor.
6327 *
6328 * Called at splsched
6329 *
6330 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6331 * This may be different than the thread that was passed in.
6332 */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/* Thread was woken for handoff and is not on any run queue. */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
		    && (!thread_no_smt(thread))
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			/* Not eligible for handoff here - dispatch it normally instead. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Thread is runnable; try to pull it off its run queue. */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6367
6368 /*
6369 * thread_run_queue_remove:
6370 *
6371 * Remove a thread from its current run queue and
6372 * return TRUE if successful.
6373 *
6374 * Thread must be locked.
6375 *
6376 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6377 * run queues because the caller locked the thread. Otherwise
6378 * the thread is on a run queue, but could be chosen for dispatch
6379 * and removed by another processor under a different lock, which
6380 * will set thread->runq to PROCESSOR_NULL.
6381 *
6382 * Hence the thread select path must not rely on anything that could
6383 * be changed under the thread lock after calling this function,
6384 * most importantly thread->sched_pri.
6385 */
boolean_t
thread_run_queue_remove(
	thread_t        thread)
{
	boolean_t removed = FALSE;
	/* Snapshot: may be concurrently cleared to NULL under the pset lock. */
	processor_t processor = thread->runq;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		assert(thread->runq == PROCESSOR_NULL);
		return FALSE;
	}

	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		/* Timeshare thread: delegate to the scheduler policy's remove. */
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime thread: the RT run queue is protected by the pset lock. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/* Re-check under the pset lock: another CPU may have dequeued it already. */
	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6436
6437 /*
6438 * Put the thread back where it goes after a thread_run_queue_remove
6439 *
6440 * Thread must have been removed under the same thread lock hold
6441 *
6442 * thread locked, at splsched
6443 */
6444 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6445 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6446 {
6447 assert(thread->runq == PROCESSOR_NULL);
6448 assert(thread->state & (TH_RUN));
6449
6450 thread_setrun(thread, options);
6451 }
6452
6453 void
sys_override_cpu_throttle(boolean_t enable_override)6454 sys_override_cpu_throttle(boolean_t enable_override)
6455 {
6456 if (enable_override) {
6457 cpu_throttle_enabled = 0;
6458 } else {
6459 cpu_throttle_enabled = 1;
6460 }
6461 }
6462
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task yet, or the idle thread: report no urgency. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime: params carry the thread's RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * Background urgency applied when thread priority is
		 * MAXPRI_THROTTLE or lower and thread is not promoted
		 * and thread has a QoS specified
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Output parameters are optional. */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6524
6525 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6526 thread_get_perfcontrol_class(thread_t thread)
6527 {
6528 /* Special case handling */
6529 if (thread->state & TH_IDLE) {
6530 return PERFCONTROL_CLASS_IDLE;
6531 }
6532
6533 if (thread->sched_mode == TH_MODE_REALTIME) {
6534 return PERFCONTROL_CLASS_REALTIME;
6535 }
6536
6537 /* perfcontrol_class based on base_pri */
6538 if (thread->base_pri <= MAXPRI_THROTTLE) {
6539 return PERFCONTROL_CLASS_BACKGROUND;
6540 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6541 return PERFCONTROL_CLASS_UTILITY;
6542 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6543 return PERFCONTROL_CLASS_NONUI;
6544 } else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6545 return PERFCONTROL_CLASS_USER_INITIATED;
6546 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6547 return PERFCONTROL_CLASS_UI;
6548 } else {
6549 if (get_threadtask(thread) == kernel_task) {
6550 /*
6551 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6552 * All other lower priority kernel threads should be treated
6553 * as regular threads for performance control purposes.
6554 */
6555 return PERFCONTROL_CLASS_KERNEL;
6556 }
6557 return PERFCONTROL_CLASS_ABOVEUI;
6558 }
6559 }
6560
6561 /*
6562 * This is the processor idle loop, which just looks for other threads
6563 * to execute. Processor idle threads invoke this without supplying a
6564 * current thread to idle without an asserted wait state.
6565 *
 * Returns the next thread to execute if dispatched directly.
6567 */
6568
6569 #if 0
6570 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6571 #else
6572 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6573 #endif
6574
6575 #if (DEVELOPMENT || DEBUG)
6576 int sched_idle_delay_cpuid = -1;
6577 #endif
6578
thread_t
processor_idle(
	thread_t        thread,
	processor_t     processor)
{
	processor_set_t pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Account the transition into idle time. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Leave the loop as soon as there is a reason to stop idling. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (processor->is_recommended && (processor->processor_primary == processor)) {
			/* Recommended primaries also wake for pending realtime work. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Secondaries/derecommended cores only wake for bound threads. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Test hook: artificially delay idle exit on a chosen CPU. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* See the fence at the top of the loop: re-sync with IPI senders. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	/* Account the transition back to running time. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6704
6705 /*
6706 * Each processor has a dedicated thread which
6707 * executes the idle loop when there is no suitable
6708 * previous context.
6709 *
6710 * This continuation is entered with interrupts disabled.
6711 */
/*
 * idle_thread:
 *
 * Continuation for a processor's dedicated idle thread.  Runs the
 * idle loop (processor_idle) and, when a runnable thread is found,
 * hands off the CPU to it; otherwise blocks back into this same
 * continuation.
 *
 * This continuation is entered with interrupts disabled.
 */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	/* Notify SMR that this CPU is idling, stamped with its last dispatch. */
	smr_cpu_leave(processor, processor->last_dispatch);

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/*
		 * Switch directly to the selected thread; the processor
		 * re-enters this continuation the next time it idles.
		 */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing runnable: block, resuming in this continuation. */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
6749
6750 kern_return_t
idle_thread_create(processor_t processor)6751 idle_thread_create(
6752 processor_t processor)
6753 {
6754 kern_return_t result;
6755 thread_t thread;
6756 spl_t s;
6757 char name[MAXTHREADNAMESIZE];
6758
6759 result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6760 if (result != KERN_SUCCESS) {
6761 return result;
6762 }
6763
6764 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6765 thread_set_thread_name(thread, name);
6766
6767 s = splsched();
6768 thread_lock(thread);
6769 thread->bound_processor = processor;
6770 processor->idle_thread = thread;
6771 thread->sched_pri = thread->base_pri = IDLEPRI;
6772 thread->state = (TH_RUN | TH_IDLE);
6773 thread->options |= TH_OPT_IDLE_THREAD;
6774 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6775 thread_unlock(thread);
6776 splx(s);
6777
6778 thread_deallocate(thread);
6779
6780 return KERN_SUCCESS;
6781 }
6782
6783 static void sched_update_powered_cores_continue(void);
6784
6785 /*
6786 * sched_startup:
6787 *
6788 * Kicks off scheduler services.
6789 *
6790 * Called at splsched.
6791 */
6792 void
sched_startup(void)6793 sched_startup(void)
6794 {
6795 kern_return_t result;
6796 thread_t thread;
6797
6798 simple_lock_init(&sched_vm_group_list_lock, 0);
6799
6800 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6801 NULL, MAXPRI_KERNEL, &thread);
6802 if (result != KERN_SUCCESS) {
6803 panic("sched_startup");
6804 }
6805
6806 thread_deallocate(thread);
6807
6808 assert_thread_magic(thread);
6809
6810 /*
6811 * Yield to the sched_init_thread once, to
6812 * initialize our own thread after being switched
6813 * back to.
6814 *
6815 * The current thread is the only other thread
6816 * active at this point.
6817 */
6818 thread_block(THREAD_CONTINUE_NULL);
6819
6820 result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
6821 NULL, MAXPRI_KERNEL, &thread);
6822 if (result != KERN_SUCCESS) {
6823 panic("sched_startup");
6824 }
6825
6826 thread_deallocate(thread);
6827
6828 assert_thread_magic(thread);
6829 }
6830
6831 #if __arm64__
6832 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6833 #endif /* __arm64__ */
6834
6835
6836 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6837
6838 static volatile uint64_t sched_maintenance_deadline;
6839 static uint64_t sched_tick_last_abstime;
6840 static uint64_t sched_tick_delta;
6841 uint64_t sched_tick_max_delta;
6842
6843
6844 /*
6845 * sched_init_thread:
6846 *
6847 * Perform periodic bookkeeping functions about ten
6848 * times per second.
6849 */
6850 void
sched_timeshare_maintenance_continue(void)6851 sched_timeshare_maintenance_continue(void)
6852 {
6853 uint64_t sched_tick_ctime, late_time;
6854
6855 struct sched_update_scan_context scan_context = {
6856 .earliest_bg_make_runnable_time = UINT64_MAX,
6857 .earliest_normal_make_runnable_time = UINT64_MAX,
6858 .earliest_rt_make_runnable_time = UINT64_MAX
6859 };
6860
6861 sched_tick_ctime = mach_absolute_time();
6862
6863 if (__improbable(sched_tick_last_abstime == 0)) {
6864 sched_tick_last_abstime = sched_tick_ctime;
6865 late_time = 0;
6866 sched_tick_delta = 1;
6867 } else {
6868 late_time = sched_tick_ctime - sched_tick_last_abstime;
6869 sched_tick_delta = late_time / sched_tick_interval;
6870 /* Ensure a delta of 1, since the interval could be slightly
6871 * smaller than the sched_tick_interval due to dispatch
6872 * latencies.
6873 */
6874 sched_tick_delta = MAX(sched_tick_delta, 1);
6875
6876 /* In the event interrupt latencies or platform
6877 * idle events that advanced the timebase resulted
6878 * in periods where no threads were dispatched,
6879 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
6880 * iterations.
6881 */
6882 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
6883
6884 sched_tick_last_abstime = sched_tick_ctime;
6885 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
6886 }
6887
6888 scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
6889 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
6890 sched_tick_delta, late_time, 0, 0, 0);
6891
6892 /* Add a number of pseudo-ticks corresponding to the elapsed interval
6893 * This could be greater than 1 if substantial intervals where
6894 * all processors are idle occur, which rarely occurs in practice.
6895 */
6896
6897 sched_tick += sched_tick_delta;
6898
6899 update_vm_info();
6900
6901 /*
6902 * Compute various averages.
6903 */
6904 compute_averages(sched_tick_delta);
6905
6906 /*
6907 * Scan the run queues for threads which
6908 * may need to be updated, and find the earliest runnable thread on the runqueue
6909 * to report its latency.
6910 */
6911 SCHED(thread_update_scan)(&scan_context);
6912
6913 SCHED(rt_runq_scan)(&scan_context);
6914
6915 uint64_t ctime = mach_absolute_time();
6916
6917 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
6918 ctime - scan_context.earliest_bg_make_runnable_time : 0;
6919
6920 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
6921 ctime - scan_context.earliest_normal_make_runnable_time : 0;
6922
6923 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
6924 ctime - scan_context.earliest_rt_make_runnable_time : 0;
6925
6926 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
6927
6928 /*
6929 * Check to see if the special sched VM group needs attention.
6930 */
6931 sched_vm_group_maintenance();
6932
6933 #if __arm64__
6934 /* Check to see if the recommended cores failsafe is active */
6935 sched_recommended_cores_maintenance();
6936 #endif /* __arm64__ */
6937
6938
6939 #if DEBUG || DEVELOPMENT
6940 #if __x86_64__
6941 #include <i386/misc_protos.h>
6942 /* Check for long-duration interrupts */
6943 mp_interrupt_watchdog();
6944 #endif /* __x86_64__ */
6945 #endif /* DEBUG || DEVELOPMENT */
6946
6947 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
6948 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
6949 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
6950
6951 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
6952 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
6953 /*NOTREACHED*/
6954 }
6955
6956 static uint64_t sched_maintenance_wakeups;
6957
6958 /*
6959 * Determine if the set of routines formerly driven by a maintenance timer
6960 * must be invoked, based on a deadline comparison. Signals the scheduler
6961 * maintenance thread on deadline expiration. Must be invoked at an interval
6962 * lower than the "sched_tick_interval", currently accomplished by
6963 * invocation via the quantum expiration timer and at context switch time.
6964 * Performance matters: this routine reuses a timestamp approximating the
6965 * current absolute time received from the caller, and should perform
6966 * no more than a comparison against the deadline in the common case.
6967 */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread must never try to wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/*
		 * Many CPUs may race here; only the one that wins the CAS on
		 * the deadline issues the wakeup, so it fires exactly once
		 * per expiration.
		 */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 claims the work; the winner computes and re-arms. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
7018
7019 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7020
7021 void
sched_init_thread(void)7022 sched_init_thread(void)
7023 {
7024 thread_block(THREAD_CONTINUE_NULL);
7025
7026 thread_t thread = current_thread();
7027
7028 thread_set_thread_name(thread, "sched_maintenance_thread");
7029
7030 sched_maintenance_thread = thread;
7031
7032 SCHED(maintenance_continuation)();
7033
7034 /*NOTREACHED*/
7035 }
7036
7037 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
7038
7039 /*
7040 * thread_update_scan / runq_scan:
7041 *
7042 * Scan the run queues to account for timesharing threads
7043 * which need to be updated.
7044 *
7045 * Scanner runs in two passes. Pass one squirrels likely
7046 * threads away in an array, pass two does the update.
7047 *
7048 * This is necessary because the run queue is locked for
7049 * the candidate scan, but the thread is locked for the update.
7050 *
7051 * Array should be sized to make forward progress, without
7052 * disabling preemption for long periods.
7053 */
7054
7055 #define THREAD_UPDATE_SIZE 128
7056
7057 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
7058 static uint32_t thread_update_count = 0;
7059
7060 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7061 boolean_t
thread_update_add_thread(thread_t thread)7062 thread_update_add_thread(thread_t thread)
7063 {
7064 if (thread_update_count == THREAD_UPDATE_SIZE) {
7065 return FALSE;
7066 }
7067
7068 thread_update_array[thread_update_count++] = thread;
7069 thread_reference(thread);
7070 return TRUE;
7071 }
7072
7073 void
thread_update_process_threads(void)7074 thread_update_process_threads(void)
7075 {
7076 assert(thread_update_count <= THREAD_UPDATE_SIZE);
7077
7078 for (uint32_t i = 0; i < thread_update_count; i++) {
7079 thread_t thread = thread_update_array[i];
7080 assert_thread_magic(thread);
7081 thread_update_array[i] = THREAD_NULL;
7082
7083 spl_t s = splsched();
7084 thread_lock(thread);
7085 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7086 SCHED(update_priority)(thread);
7087 }
7088 thread_unlock(thread);
7089 splx(s);
7090
7091 thread_deallocate(thread);
7092 }
7093
7094 thread_update_count = 0;
7095 }
7096
7097 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)7098 runq_scan_thread(
7099 thread_t thread,
7100 sched_update_scan_context_t scan_context)
7101 {
7102 assert_thread_magic(thread);
7103
7104 if (thread->sched_stamp != sched_tick &&
7105 thread->sched_mode == TH_MODE_TIMESHARE) {
7106 if (thread_update_add_thread(thread) == FALSE) {
7107 return TRUE;
7108 }
7109 }
7110
7111 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7112 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7113 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7114 }
7115 } else {
7116 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7117 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7118 }
7119 }
7120
7121 return FALSE;
7122 }
7123
7124 /*
7125 * Scan a runq for candidate threads.
7126 *
7127 * Returns TRUE if retry is needed.
7128 */
7129 boolean_t
runq_scan(run_queue_t runq,sched_update_scan_context_t scan_context)7130 runq_scan(
7131 run_queue_t runq,
7132 sched_update_scan_context_t scan_context)
7133 {
7134 int count = runq->count;
7135 int queue_index;
7136
7137 assert(count >= 0);
7138
7139 if (count == 0) {
7140 return FALSE;
7141 }
7142
7143 for (queue_index = bitmap_first(runq->bitmap, NRQS);
7144 queue_index >= 0;
7145 queue_index = bitmap_next(runq->bitmap, queue_index)) {
7146 thread_t thread;
7147 circle_queue_t queue = &runq->queues[queue_index];
7148
7149 cqe_foreach_element(thread, queue, runq_links) {
7150 assert(count > 0);
7151 if (runq_scan_thread(thread, scan_context) == TRUE) {
7152 return TRUE;
7153 }
7154 count--;
7155 }
7156 }
7157
7158 return FALSE;
7159 }
7160
7161 #if CONFIG_SCHED_CLUTCH
7162
7163 boolean_t
sched_clutch_timeshare_scan(queue_t thread_queue,uint16_t thread_count,sched_update_scan_context_t scan_context)7164 sched_clutch_timeshare_scan(
7165 queue_t thread_queue,
7166 uint16_t thread_count,
7167 sched_update_scan_context_t scan_context)
7168 {
7169 if (thread_count == 0) {
7170 return FALSE;
7171 }
7172
7173 thread_t thread;
7174 qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
7175 if (runq_scan_thread(thread, scan_context) == TRUE) {
7176 return TRUE;
7177 }
7178 thread_count--;
7179 }
7180
7181 assert(thread_count == 0);
7182 return FALSE;
7183 }
7184
7185
7186 #endif /* CONFIG_SCHED_CLUTCH */
7187
7188 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7189
7190 bool
thread_is_eager_preempt(thread_t thread)7191 thread_is_eager_preempt(thread_t thread)
7192 {
7193 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7194 }
7195
/*
 * thread_set_eager_preempt:
 *
 * Mark a thread eagerly preemptible and, if it is currently running,
 * trigger re-evaluation of preemption on its processor.
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		thread_unlock(thread);

		if (ast != AST_NONE) {
			/* A preemption is now pending: yield right away. */
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/*
		 * If the thread appears to be running on another CPU, poke
		 * that CPU so it re-checks preemption with the new flag.
		 */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
7229
7230 void
thread_clear_eager_preempt(thread_t thread)7231 thread_clear_eager_preempt(thread_t thread)
7232 {
7233 spl_t s = splsched();
7234 thread_lock(thread);
7235
7236 assert(thread_is_eager_preempt(thread));
7237
7238 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7239
7240 if (thread == current_thread()) {
7241 current_processor()->current_is_eagerpreempt = false;
7242 }
7243
7244 thread_unlock(thread);
7245 splx(s);
7246 }
7247
7248 /*
7249 * Scheduling statistics
7250 */
7251 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7252 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7253 {
7254 struct sched_statistics *stats;
7255 boolean_t to_realtime = FALSE;
7256
7257 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7258 stats->csw_count++;
7259
7260 if (otherpri >= BASEPRI_REALTIME) {
7261 stats->rt_sched_count++;
7262 to_realtime = TRUE;
7263 }
7264
7265 if ((reasons & AST_PREEMPT) != 0) {
7266 stats->preempt_count++;
7267
7268 if (selfpri >= BASEPRI_REALTIME) {
7269 stats->preempted_rt_count++;
7270 }
7271
7272 if (to_realtime) {
7273 stats->preempted_by_rt_count++;
7274 }
7275 }
7276 }
7277
7278 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7279 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7280 {
7281 uint64_t timestamp = mach_absolute_time();
7282
7283 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7284 stats->last_change_timestamp = timestamp;
7285 }
7286
7287 /*
7288 * For calls from assembly code
7289 */
7290 #undef thread_wakeup
7291 void
7292 thread_wakeup(
7293 event_t x);
7294
/* Out-of-line thread_wakeup: wake waiters on x with THREAD_AWAKENED. */
void
thread_wakeup(
	event_t x)
{
	thread_wakeup_with_result(x, THREAD_AWAKENED);
}
7301
7302 boolean_t
preemption_enabled(void)7303 preemption_enabled(void)
7304 {
7305 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7306 }
7307
/*
 * sched_timer_deadline_tracking_init:
 *
 * Convert the default timer-deadline tracking bin thresholds from
 * nanoseconds into mach absolute-time units.
 */
static void
sched_timer_deadline_tracking_init(void)
{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
7314
7315 static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
7316 processor_reason_t latest_requested_reason = REASON_NONE;
7317 static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
7318 bool perfcontrol_sleep_override = false;
7319
7320 LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
7321 LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
7322 int32_t cluster_powerdown_suspend_count = 0;
7323
/*
 * sched_is_in_sleep:
 *
 * Returns whether the system-sleep core override is currently active.
 */
bool
sched_is_in_sleep(void)
{
	/*
	 * Fence before the unlocked read; writers update the flag under
	 * sched_available_cores_lock (see sched_override/restore_*_sleep).
	 */
	os_atomic_thread_fence(acquire);
	return perfcontrol_sleep_override;
}
7330
/*
 * sched_update_powered_cores_continue:
 *
 * Continuation for the worker thread that applies powered-core
 * requests from the perf controller.  Repeatedly applies the latest
 * request until it matches the applied state, then blocks waiting for
 * the next request.  Does nothing while cluster powerdown is suspended.
 */
static void
sched_update_powered_cores_continue(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (!cluster_powerdown_suspend_count) {
		/* Snapshot the pending request under the lock. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		uint64_t latest = latest_requested_powered_cores;
		processor_reason_t reason = latest_requested_reason;
		uint64_t current = current_requested_powered_cores;
		current_requested_powered_cores = latest;
		bool in_sleep = perfcontrol_sleep_override;

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		while (latest != current) {
			if (!in_sleep) {
				assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
			}

			/*
			 * Re-snapshot: a newer request may have arrived while
			 * sched_update_powered_cores() was in progress.
			 */
			s = splsched();
			simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

			latest = latest_requested_powered_cores;
			reason = latest_requested_reason;
			current = current_requested_powered_cores;
			current_requested_powered_cores = latest;
			in_sleep = perfcontrol_sleep_override;

			simple_unlock(&sched_available_cores_lock);
			splx(s);
		}

		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);

		/*
		 * Close the missed-wakeup window: if a request slipped in
		 * after the loop exited but before assert_wait, cancel the
		 * wait so the continuation runs again immediately.
		 */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
		}
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);

	thread_block((thread_continue_t)sched_update_powered_cores_continue);
	/*NOTREACHED*/
}
7384
/*
 * sched_perfcontrol_update_powered_cores:
 *
 * Perf-controller entry point to request a new powered-core mask.
 * Records the request and wakes the worker thread
 * (sched_update_powered_cores_continue), unless cluster powerdown is
 * currently suspended, in which case the request is dropped.
 */
void
sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
{
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	/* Test-only flags: just validate the asserted state; apply nothing. */
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(cluster_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(perfcontrol_sleep_override == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	bool should_wakeup = !cluster_powerdown_suspend_count;
	if (should_wakeup) {
		latest_requested_powered_cores = requested_powered_cores;
		latest_requested_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	if (should_wakeup) {
		thread_wakeup((event_t)sched_update_powered_cores_continue);
	}
}
7418
/*
 * suspend_cluster_powerdown:
 *
 * Temporarily prevent cluster powerdown, powering all cores.
 * Nestable; each call must be balanced by resume_cluster_powerdown().
 */
void
suspend_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	assert(cluster_powerdown_suspend_count >= 0);

	bool first_suspend = (cluster_powerdown_suspend_count == 0);
	if (first_suspend) {
		/* Reset any outstanding perf-controller request to all-powered. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	cluster_powerdown_suspend_count++;

	if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7446
/*
 * resume_cluster_powerdown:
 *
 * Balance a prior suspend_cluster_powerdown().  When the last
 * suspension is released, unlock the powered-cores state (leaving all
 * cores powered).  Panics on unbalanced calls.
 */
void
resume_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (cluster_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
	}

	cluster_powerdown_suspend_count--;

	bool last_resume = (cluster_powerdown_suspend_count == 0);

	if (last_resume) {
		/* Reset the request state to all-powered before unlocking. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7475
7476 LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
7477 static bool user_suspended_cluster_powerdown = false;
7478
7479 kern_return_t
suspend_cluster_powerdown_from_user(void)7480 suspend_cluster_powerdown_from_user(void)
7481 {
7482 kern_return_t ret = KERN_FAILURE;
7483
7484 lck_mtx_lock(&user_cluster_powerdown_lock);
7485
7486 if (!user_suspended_cluster_powerdown) {
7487 suspend_cluster_powerdown();
7488 user_suspended_cluster_powerdown = true;
7489 ret = KERN_SUCCESS;
7490 }
7491
7492 lck_mtx_unlock(&user_cluster_powerdown_lock);
7493
7494 return ret;
7495 }
7496
7497 kern_return_t
resume_cluster_powerdown_from_user(void)7498 resume_cluster_powerdown_from_user(void)
7499 {
7500 kern_return_t ret = KERN_FAILURE;
7501
7502 lck_mtx_lock(&user_cluster_powerdown_lock);
7503
7504 if (user_suspended_cluster_powerdown) {
7505 resume_cluster_powerdown();
7506 user_suspended_cluster_powerdown = false;
7507 ret = KERN_SUCCESS;
7508 }
7509
7510 lck_mtx_unlock(&user_cluster_powerdown_lock);
7511
7512 return ret;
7513 }
7514
7515 int
get_cluster_powerdown_user_suspended(void)7516 get_cluster_powerdown_user_suspended(void)
7517 {
7518 lck_mtx_lock(&user_cluster_powerdown_lock);
7519
7520 int ret = (int)user_suspended_cluster_powerdown;
7521
7522 lck_mtx_unlock(&user_cluster_powerdown_lock);
7523
7524 return ret;
7525 }
7526
7527 #if DEVELOPMENT || DEBUG
7528 /* Functions to support the temporary sysctl */
7529 static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
/*
 * sysctl helper: decode a packed request word and forward it to
 * sched_perfcontrol_update_powered_cores().
 *
 * Encoding of requested_powered_cores:
 *   bit 31     - reason: 1 = REASON_CLPC_USER, 0 = REASON_CLPC_SYSTEM
 *   mask 0x30000000 - flags passed through unchanged
 *   bits 28:0  - requested powered-core mask
 */
void
sched_set_powered_cores(int requested_powered_cores)
{
	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
	uint32_t flags = requested_powered_cores & 0x30000000;

	/* Remember the raw request for sched_get_powered_cores(). */
	saved_requested_powered_cores = requested_powered_cores;

	requested_powered_cores = bits(requested_powered_cores, 28, 0);

	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
}
/* sysctl helper: last raw value passed to sched_set_powered_cores(). */
int
sched_get_powered_cores(void)
{
	return (int)saved_requested_powered_cores;
}
7547 #endif
7548
7549 /*
7550 * Ensure that all cores are powered and recommended before sleep
7551 */
void
sched_override_available_cores_for_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == false) {
		perfcontrol_sleep_override = true;
#if __arm__ || __arm64__
		/* Recommend every core while the sleep override is active. */
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Also keep all cores powered; balanced in the restore path. */
	suspend_cluster_powerdown();
}
7570
7571 /*
7572 * Restore the previously recommended cores, but leave all cores powered
7573 * after sleep
7574 */
void
sched_restore_available_cores_after_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == true) {
		perfcontrol_sleep_override = false;
#if __arm__ || __arm64__
		/* Re-apply the recommendation that was pending before sleep. */
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_NONE, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Balance the suspend taken in sched_override_available_cores_for_sleep(). */
	resume_cluster_powerdown();
}
7594
7595 #if __arm__ || __arm64__
7596
7597 uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
7598 bool perfcontrol_failsafe_active = false;
7599
7600 uint64_t perfcontrol_failsafe_maintenance_runnable_time;
7601 uint64_t perfcontrol_failsafe_activation_time;
7602 uint64_t perfcontrol_failsafe_deactivation_time;
7603
7604 /* data covering who likely caused it and how long they ran */
7605 #define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
7606 char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
7607 int perfcontrol_failsafe_pid;
7608 uint64_t perfcontrol_failsafe_tid;
7609 uint64_t perfcontrol_failsafe_thread_timer_at_start;
7610 uint64_t perfcontrol_failsafe_thread_timer_last_seen;
7611 uint64_t perfcontrol_failsafe_recommended_at_trigger;
7612
7613 /*
7614 * Perf controller calls here to update the recommended core bitmask.
7615 * If the failsafe is active, we don't immediately apply the new value.
7616 * Instead, we store the new request and use it after the failsafe deactivates.
7617 *
7618 * If the failsafe is not active, immediately apply the update.
7619 *
7620 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7621 * interrupts are enabled
7622 *
7623 * currently prototype is in osfmk/arm/machine_routines.h
7624 */
void
sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the request in the per-reason slot. */
	if (reason == REASON_CLPC_SYSTEM) {
		perfcontrol_system_requested_recommended_cores = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		perfcontrol_user_requested_recommended_cores = recommended_cores;
	}

	/* The effective request is the intersection of both slots. */
	perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
	perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);

	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
	} else {
		/*
		 * Failsafe or sleep override in effect: the stored request is
		 * applied later, when the override deactivates.  Trace it.
		 */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
7655
/*
 * Convenience wrapper: treat a plain recommended-cores update as a
 * REASON_CLPC_USER request with no flags.
 */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
7661
7662 /*
7663 * Consider whether we need to activate the recommended cores failsafe
7664 *
7665 * Called from quantum timer interrupt context of a realtime thread
7666 * No scheduler locks are held, interrupts are disabled
7667 */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
		/* keep track of how long the responsible thread runs */
		uint64_t cur_th_time = recount_current_thread_time_mach();

		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		/*
		 * Re-check under the lock (the failsafe may have just been
		 * deactivated), and only update the timer snapshot if the
		 * current thread is the one blamed for the starvation.
		 */
		if (perfcontrol_failsafe_active == TRUE &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
		}

		simple_unlock(&sched_available_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
		return;
	}

	/* Runnable-since timestamps older than this threshold indicate starvation. */
	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (m_thread->runq == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation. We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Lost the race with another CPU activating the failsafe; nothing to do. */
	if (perfcontrol_failsafe_active == TRUE) {
		simple_unlock(&sched_available_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);

	perfcontrol_failsafe_active = TRUE;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = recount_current_thread_time_mach();

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);

	simple_unlock(&sched_available_cores_lock);
}
7778
7779 /*
7780 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7781 *
7782 * Runs in the context of the maintenance thread, no locks held
7783 */
static void
sched_recommended_cores_maintenance(void)
{
	/* Common case - no failsafe, nothing to be done here */
	if (__probable(perfcontrol_failsafe_active == FALSE)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (perfcontrol_failsafe_active == FALSE) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */

	int pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	/* CPU time accumulated by the blamed thread while the failsafe was armed. */
	uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
	uint64_t rec_cores_after = perfcontrol_requested_recommended_cores;
	uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	perfcontrol_failsafe_active = FALSE;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);

	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
	    REASON_NONE, 0);

out:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Do the printf outside the lock/interrupt-disabled region. */
	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		/* The *_ms variables briefly hold nanoseconds until the division below. */
		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%llx -> 0x%llx\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
7861
7862 #endif /* __arm64__ */
7863
/*
 * User-initiated enable/disable of a processor's recommendation bit.
 * Returns KERN_NOT_SUPPORTED for the boot processor, KERN_SUCCESS otherwise.
 */
kern_return_t
sched_processor_enable(processor_t processor, boolean_t enable)
{
	assert(preemption_enabled());

	if (processor == master_processor) {
		/* The system can hang if this is allowed */
		return KERN_NOT_SUPPORTED;
	}

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the request in the user-controlled recommendation mask. */
	if (enable) {
		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
	} else {
		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
	}

#if __arm64__
	/*
	 * Only apply the combined mask when neither the failsafe nor the sleep
	 * override is in effect; otherwise just emit a tracepoint and let the
	 * override's teardown path re-apply the masks later.
	 */
	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
	} else {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}
#else /* __arm64__ */
	sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
#endif /* ! __arm64__ */

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return KERN_SUCCESS;
}
7902
/*
 * Mark a processor online in sched_online_processors.
 * NOTE(review): the _locked suffix and the unlocked bit_set suggest the
 * caller holds sched_available_cores_lock — confirm against call sites.
 */
void
sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
{
	/* Only the system itself may transition the boot processor. */
	assert((processor != master_processor) || (reason == REASON_SYSTEM));

	bit_set(sched_online_processors, processor->cpu_id);
}
7910
7911 kern_return_t
sched_mark_processor_offline(processor_t processor,processor_reason_t reason)7912 sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7913 {
7914 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7915 kern_return_t ret = KERN_SUCCESS;
7916
7917 spl_t s = splsched();
7918 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7919
7920 if (reason == REASON_SYSTEM) {
7921 bit_clear(sched_online_processors, processor->cpu_id);
7922 simple_unlock(&sched_available_cores_lock);
7923 splx(s);
7924 return ret;
7925 }
7926
7927 uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7928
7929 if (!bit_test(sched_online_processors, processor->cpu_id)) {
7930 /* Processor is already offline */
7931 ret = KERN_NOT_IN_SET;
7932 } else if (available_cores == BIT(processor->cpu_id)) {
7933 ret = KERN_RESOURCE_SHORTAGE;
7934 } else {
7935 bit_clear(sched_online_processors, processor->cpu_id);
7936 ret = KERN_SUCCESS;
7937 }
7938
7939 simple_unlock(&sched_available_cores_lock);
7940 splx(s);
7941
7942 return ret;
7943 }
7944
7945 /*
7946 * Apply a new recommended cores mask to the processors it affects
7947 * Runs after considering failsafes and such
7948 *
7949 * Iterate over processors and update their ->is_recommended field.
7950 * If a processor is running, we let it drain out at its next
7951 * quantum expiration or blocking point. If a processor is idle, there
7952 * may be more work for it to do, so IPI it.
7953 *
7954 * interrupts disabled, sched_available_cores_lock is held
7955 */
static void
sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
{
	/* CPUs that were idle and need an IPI once all pset locks are dropped. */
	uint64_t needs_exit_idle_mask = 0x0;

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm64__ */
	    0, 0, 0);
#endif /* ! __arm64__ */

	if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Bits that differ from the current recommendation, restricted to this pset. */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				processor->last_recommend_reason = reason;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					/* Idle CPUs other than ourselves get signaled below. */
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);

			/* SMR bring-up happens outside the pset lock. */
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
			    cpu_id = lsb_next(newly_recommended, cpu_id)) {
				smr_cpu_up(processor_array[cpu_id],
				    SMR_CPU_REASON_IGNORED);
			}
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				/* REASON_NONE means "restoring previous state": keep the old derecommend reason. */
				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type == SCHED_IPI_NONE) {
					/*
					 * If the core is idle,
					 * we can directly mark the processor
					 * as "Ignored"
					 *
					 * Otherwise, smr will detect this
					 * during smr_cpu_leave() when the
					 * processor actually idles.
					 */
					smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
				} else if (processor == current_processor()) {
					ast_on(AST_PREEMPT);
				} else {
					sched_ipi_perform(processor, ipi_type);
				}

				/* Re-take the pset lock dropped by processor_queue_shutdown. */
				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
8094
/*
 * Apply a new powered-cores mask: start newly requested cores, then shut
 * down cores no longer requested. Runs with preemption enabled; pset locks
 * are taken only briefly to snapshot state.
 */
static void
sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
{
	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested_powered_cores, reason, flags, 0);

	/* LOCK_STATE/UNLOCK_STATE are only valid for system-wide all-cores requests. */
	assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);

	/*
	 * Loop through newly set requested_powered_cores and start them.
	 * Loop through newly cleared requested_powered_cores and shut them down.
	 */

	if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
		/* CLPC-driven shutdowns are expected to be undone later. */
		flags |= SHUTDOWN_TEMPORARY;
	}

	/* First set powered cores */
	cpumap_t started_cores = 0ull;
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Snapshot which CPUs are currently powered, under the pset lock. */
			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_powered = requested_changes & requested_powered_cores;

			cpumap_t cpu_map = newly_powered;

			if (flags & (LOCK_STATE | UNLOCK_STATE)) {
				/*
				 * We need to change the lock state even if
				 * we don't need to change the actual state.
				 */
				cpu_map = pset_requested_powered_cores;
				/* But not the master_processor, which is always implicitly locked */
				bit_clear(cpu_map, master_processor->cpu_id);
			}

			if (cpu_map == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor_start_reason(processor, reason, flags);
				bit_set(started_cores, cpu_id);
			}
		}
	}
	if (flags & WAIT_FOR_LAST_START) {
		/* Block until every core we kicked off has completed its start. */
		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
			processor_t processor = processor_array[cpu_id];
			processor_wait_for_start(processor);
		}
	}

	/* Now shutdown not powered cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;

			if (newly_unpowered == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
				processor_t processor = processor_array[cpu_id];

				processor_exit_reason(processor, reason, flags);
			}
		}
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
}
8188
8189 void
thread_set_options(uint32_t thopt)8190 thread_set_options(uint32_t thopt)
8191 {
8192 spl_t x;
8193 thread_t t = current_thread();
8194
8195 x = splsched();
8196 thread_lock(t);
8197
8198 t->options |= thopt;
8199
8200 thread_unlock(t);
8201 splx(x);
8202 }
8203
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	/* Stash a hint describing why the thread is about to block; consumed elsewhere. */
	thread->pending_block_hint = block_hint;
}
8209
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	/* Dispatch to the active scheduler's qos_max_parallelism implementation. */
	return SCHED(qos_max_parallelism)(qos, options);
}
8215
8216 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)8217 sched_qos_max_parallelism(__unused int qos, uint64_t options)
8218 {
8219 host_basic_info_data_t hinfo;
8220 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8221
8222
8223 /*
8224 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
8225 * implement their own qos_max_parallelism() interfaces.
8226 */
8227 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8228
8229 /* Query the machine layer for core information */
8230 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
8231 (host_info_t)&hinfo, &count);
8232 assert(kret == KERN_SUCCESS);
8233
8234 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8235 return hinfo.logical_cpu;
8236 } else {
8237 return hinfo.physical_cpu;
8238 }
8239 }
8240
/* Global toggle: when 0, NO_SMT designations are ignored system-wide. */
int sched_allow_NO_SMT_threads = 1;

/*
 * True if this thread must not run alongside an SMT sibling: bound threads
 * are exempt; otherwise honor the per-thread sched flag or the task-wide
 * TF_NO_SMT flag.
 */
bool
thread_no_smt(thread_t thread)
{
	return sched_allow_NO_SMT_threads &&
	       (thread->bound_processor == PROCESSOR_NULL) &&
	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
}
8249
bool
processor_active_thread_no_smt(processor_t processor)
{
	/* Is the (unbound) thread currently on this processor marked NO_SMT? Uses cached per-processor snapshots. */
	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
}
8255
8256 #if __arm64__
8257
8258 /*
8259 * Set up or replace old timer with new timer
8260 *
8261 * Returns true if canceled old timer, false if it did not
8262 */
8263 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)8264 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8265 {
8266 /*
8267 * Exchange deadline for new deadline, if old deadline was nonzero,
8268 * then I cancelled the callback, otherwise I didn't
8269 */
8270
8271 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8272 relaxed) != 0;
8273 }
8274
8275 /*
8276 * Set global SFI window (in usec)
8277 */
8278 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)8279 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8280 {
8281 kern_return_t ret = KERN_NOT_SUPPORTED;
8282 #if CONFIG_THREAD_GROUPS
8283 if (window_usecs == 0ULL) {
8284 ret = sfi_window_cancel();
8285 } else {
8286 ret = sfi_set_window(window_usecs);
8287 }
8288 #endif // CONFIG_THREAD_GROUPS
8289 return ret;
8290 }
8291
8292 /*
8293 * Set background and maintenance SFI class offtimes
8294 */
8295 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)8296 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8297 {
8298 kern_return_t ret = KERN_NOT_SUPPORTED;
8299 #if CONFIG_THREAD_GROUPS
8300 if (offtime_usecs == 0ULL) {
8301 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8302 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8303 } else {
8304 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8305 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8306 }
8307 #endif // CONFIG_THREAD_GROUPS
8308 return ret;
8309 }
8310
8311 /*
8312 * Set utility SFI class offtime
8313 */
8314 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)8315 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8316 {
8317 kern_return_t ret = KERN_NOT_SUPPORTED;
8318 #if CONFIG_THREAD_GROUPS
8319 if (offtime_usecs == 0ULL) {
8320 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8321 } else {
8322 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8323 }
8324 #endif // CONFIG_THREAD_GROUPS
8325 return ret;
8326 }
8327
8328 #endif /* __arm64__ */
8329
8330 #if CONFIG_SCHED_EDGE
8331
8332 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8333
8334 /*
8335 * sched_edge_pset_running_higher_bucket()
8336 *
8337 * Routine to calculate cumulative running counts for each scheduling
8338 * bucket. This effectively lets the load calculation calculate if a
8339 * cluster is running any threads at a QoS lower than the thread being
8340 * migrated etc.
8341 */
8342
8343 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)8344 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
8345 {
8346 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
8347
8348 /* Edge Scheduler Optimization */
8349 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8350 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
8351 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8352 running_higher[bucket]++;
8353 }
8354 }
8355 }
8356
8357 /*
8358 * sched_update_pset_load_average()
8359 *
8360 * Updates the load average for each sched bucket for a cluster.
8361 * This routine must be called with the pset lock held.
8362 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* A concurrent updater published a newer timestamp; skip this stale update. */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp so the EWMA multiplication below cannot overflow 64 bits. */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

#if CONFIG_SCHED_EDGE
	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		if (old_shared_load != new_shared_load) {
			/* Trace only on change to keep the trace stream quiet. */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}
#endif /* CONFIG_SCHED_EDGE */

	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		/* Per-CPU demand: runnable (cumulative) + realtime + running at-or-below this bucket. */
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		if (load_average != old_load_average) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
8445
8446 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)8447 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8448 {
8449 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8450 uint64_t avg_thread_execution_time = 0;
8451
8452 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8453 old_execution_time_packed.pset_execution_time_packed,
8454 new_execution_time_packed.pset_execution_time_packed, relaxed, {
8455 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8456 int64_t delta_ticks = curtime - last_update;
8457 if (delta_ticks < 0) {
8458 /*
8459 * Its possible that another CPU came in and updated the pset_execution_time
8460 * before this CPU could do it. Since the average execution time is meant to
8461 * be an approximate measure per cluster, ignore the older update.
8462 */
8463 os_atomic_rmw_loop_give_up(return );
8464 }
8465 uint64_t delta_nsecs = 0;
8466 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8467
8468 uint64_t nanotime = 0;
8469 absolutetime_to_nanoseconds(execution_time, &nanotime);
8470 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8471
8472 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8473 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8474
8475 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8476 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8477 new_execution_time_packed.pset_execution_time_last_update = curtime;
8478 });
8479 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
8480 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8481 }
8482 }
8483
uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	/* Lock-free read of the shared-resource load published by sched_update_pset_load_average(). */
	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
}
8489
8490 #else /* CONFIG_SCHED_EDGE */
8491
/*
 * Non-Edge schedulers: simple exponential decay with weight 1/2.
 * Load is (running CPUs + runnable + realtime) in fixed point via
 * PSET_LOAD_NUMERATOR_SHIFT; the EDGE counterpart documents that this
 * is called with the pset lock held.
 */
void
sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
{
	int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
	/* New average = (old + instantaneous) / 2. */
	int new_load_average = ((int)pset->load_average + load) >> 1;

	pset->load_average = new_load_average;
#if (DEVELOPMENT || DEBUG)
#if __AMP__
	/* Trace P-cluster load on development AMP builds only. */
	if (pset->pset_cluster_type == PSET_AMP_P) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
	}
#endif
#endif
}
8508
/*
 * Average-execution-time tracking is only maintained by the Edge
 * scheduler (CONFIG_SCHED_EDGE); this is a no-op stub for other policies.
 */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
8513
8514 #endif /* CONFIG_SCHED_EDGE */
8515
8516 /* pset is locked */
8517 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)8518 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8519 {
8520 int cpuid = processor->cpu_id;
8521 #if defined(__x86_64__)
8522 if (sched_avoid_cpu0 && (cpuid == 0)) {
8523 return false;
8524 }
8525 #endif
8526
8527 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8528
8529 return bit_test(fasttrack_map, cpuid);
8530 }
8531
/*
 * Pick a processor in this pset to run a realtime thread.
 *
 * Search order:
 *   1. primaries that are available, not running realtime, and have no
 *      urgent AST pending (rotated so cpu0 is tried last when avoided);
 *   2. secondaries whose primary is already running a realtime thread;
 *   3. any remaining secondary;
 *   4. on x86 with sched_avoid_cpu0 == 2, retry once allowing cpu0/cpu1;
 *   5. if still nothing and more CPUs not running realtime remain than
 *      realtime threads queued, some primary (then secondary) in this
 *      pset, so the thread is at least enqueued here.
 *
 * Returns PROCESSOR_NULL when no candidate is found (always the case
 * when skip_processor was supplied and steps 1-4 failed).
 *
 * pset is locked.
 */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/* Available CPUs with no urgent AST pending and not running realtime */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Mode 2 excludes cpu0 entirely (until the retry below) */
		bit_clear(cpu_map, 0);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate right by 1 so cpu0 lands in bit 63 and is chosen last */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the actual cpu id */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Rotate by 2 so cpu0/cpu1 are chosen last */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider all remaining secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * Nothing found while avoiding cpu0/cpu1: relax that constraint and
	 * retry once. (The assignment is guarded by #if because avoid_cpu0
	 * is a const bool on non-x86 and the compiler would reject the
	 * assignment even though this branch is unreachable there.)
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback, but now counting secondaries as well */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
8660
8661 /*
8662 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
8663 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
8664 *
8665 * pset is locked.
8666 */
8667 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)8668 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
8669 {
8670 uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
8671 processor_t fd_processor = PROCESSOR_NULL;
8672 int lowest_priority = max_pri;
8673
8674 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
8675 if (skip_processor) {
8676 bit_clear(cpu_map, skip_processor->cpu_id);
8677 }
8678 if (skip_spills) {
8679 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8680 }
8681
8682 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8683 processor_t processor = processor_array[cpuid];
8684
8685 if (processor->current_pri > lowest_priority) {
8686 continue;
8687 }
8688
8689 if (processor->current_pri < lowest_priority) {
8690 lowest_priority = processor->current_pri;
8691 furthest_deadline = processor->deadline;
8692 fd_processor = processor;
8693 continue;
8694 }
8695
8696 if (processor->deadline > furthest_deadline) {
8697 furthest_deadline = processor->deadline;
8698 fd_processor = processor;
8699 }
8700 }
8701
8702 if (fd_processor) {
8703 return fd_processor;
8704 }
8705
8706 /*
8707 * There is a race condition possible when there are multiple processor sets.
8708 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
8709 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
8710 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
8711 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
8712 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
8713 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
8714 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
8715 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
8716 *
8717 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
8718 * on the run queue of that pset.
8719 */
8720 if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
8721 cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
8722 assert(skip_processor == PROCESSOR_NULL);
8723 assert(skip_spills == false);
8724
8725 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8726 processor_t processor = processor_array[cpuid];
8727
8728 if (processor->current_pri > lowest_priority) {
8729 continue;
8730 }
8731
8732 if (processor->current_pri < lowest_priority) {
8733 lowest_priority = processor->current_pri;
8734 furthest_deadline = processor->deadline;
8735 fd_processor = processor;
8736 continue;
8737 }
8738
8739 if (processor->deadline > furthest_deadline) {
8740 furthest_deadline = processor->deadline;
8741 fd_processor = processor;
8742 }
8743 }
8744 }
8745
8746 return fd_processor;
8747 }
8748
8749 /* pset is locked */
8750 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)8751 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
8752 {
8753 bool skip_spills = true;
8754 bool include_ast_urgent_pending_cpus = false;
8755
8756 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
8757 if (next_processor != PROCESSOR_NULL) {
8758 return next_processor;
8759 }
8760
8761 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
8762 return next_processor;
8763 }
8764
8765 #if defined(__x86_64__)
8766 /* pset is locked */
8767 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)8768 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
8769 {
8770 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8771 int nbackup_cpus = 0;
8772
8773 if (include_backups && rt_runq_is_low_latency(pset)) {
8774 nbackup_cpus = sched_rt_n_backup_processors;
8775 }
8776
8777 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8778 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8779 bit_clear(cpu_map, 0);
8780 }
8781 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8782 }
8783
8784 /* pset is locked */
8785 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)8786 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
8787 {
8788 int nbackup_cpus = 0;
8789
8790 if (include_backups && rt_runq_is_low_latency(pset)) {
8791 nbackup_cpus = sched_rt_n_backup_processors;
8792 }
8793
8794 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
8795 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8796 }
8797 #endif
8798
/*
 * Decide whether this processor may run a realtime thread right now.
 *
 * Always false when the processor is not recommended. On x86 a pending
 * RT spill to this CPU always permits it; otherwise cpu0 (and, in
 * avoid-cpu0 mode, cpu1 as its SMT sibling) and SMT secondaries are only
 * allowed once the remaining eligible CPUs are saturated with realtime
 * threads. as_backup widens that saturation check by the configured
 * number of backup processors when the RT queue is low latency.
 */
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	/* A CPU already chosen as an RT spill target must be allowed to run RT */
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* cpu0 only when all other primaries are busy with RT */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* mode 2 also discounts cpu1 (cpu0's SMT sibling) */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/* cpu1 (sibling of avoided cpu0) needs RT-on-SMT plus saturation of all others */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* SMT secondary: only when RT-on-SMT is allowed and primaries are saturated */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
8829
8830 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)8831 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
8832 {
8833 if (drop_lock) {
8834 pset_unlock(pset);
8835 }
8836 }
8837
8838 void
thread_set_no_smt(bool set)8839 thread_set_no_smt(bool set)
8840 {
8841 if (!system_is_SMT) {
8842 /* Not a machine that supports SMT */
8843 return;
8844 }
8845
8846 thread_t thread = current_thread();
8847
8848 spl_t s = splsched();
8849 thread_lock(thread);
8850 if (set) {
8851 thread->sched_flags |= TH_SFLAG_NO_SMT;
8852 }
8853 thread_unlock(thread);
8854 splx(s);
8855 }
8856
8857 bool
thread_get_no_smt(void)8858 thread_get_no_smt(void)
8859 {
8860 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8861 }
8862
8863 extern void task_set_no_smt(task_t);
8864 void
task_set_no_smt(task_t task)8865 task_set_no_smt(task_t task)
8866 {
8867 if (!system_is_SMT) {
8868 /* Not a machine that supports SMT */
8869 return;
8870 }
8871
8872 if (task == TASK_NULL) {
8873 task = current_task();
8874 }
8875
8876 task_lock(task);
8877 task->t_flags |= TF_NO_SMT;
8878 task_unlock(task);
8879 }
8880
8881 #if DEBUG || DEVELOPMENT
8882 extern void sysctl_task_set_no_smt(char no_smt);
8883 void
sysctl_task_set_no_smt(char no_smt)8884 sysctl_task_set_no_smt(char no_smt)
8885 {
8886 if (!system_is_SMT) {
8887 /* Not a machine that supports SMT */
8888 return;
8889 }
8890
8891 task_t task = current_task();
8892
8893 task_lock(task);
8894 if (no_smt == '1') {
8895 task->t_flags |= TF_NO_SMT;
8896 }
8897 task_unlock(task);
8898 }
8899
8900 extern char sysctl_task_get_no_smt(void);
8901 char
sysctl_task_get_no_smt(void)8902 sysctl_task_get_no_smt(void)
8903 {
8904 task_t task = current_task();
8905
8906 if (task->t_flags & TF_NO_SMT) {
8907 return '1';
8908 }
8909 return '0';
8910 }
8911 #endif /* DEVELOPMENT || DEBUG */
8912
8913
/*
 * Bind (or soft-bind) a thread to a cluster of the given type:
 * 'e'/'E' for efficiency, 'p'/'P' for performance. Any other character
 * leaves the thread unbound. The target is pset0 when it matches the
 * requested type, otherwise the first pset of node 1 (if present).
 * If the target is the current thread, blocks so the binding takes
 * effect immediately. No-op on non-AMP builds.
 */
__private_extern__ void
thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
{
#if __AMP__
	spl_t s = splsched();
	thread_lock(thread);
	/* Clear any previous binding before applying the new one */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	if (soft_bound) {
		/* Soft binding lets the scheduler override the cluster when needed */
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	switch (cluster_type) {
	case 'e':
	case 'E':
		if (pset0.pset_cluster_type == PSET_AMP_E) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	case 'p':
	case 'P':
		if (pset0.pset_cluster_type == PSET_AMP_P) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	default:
		/* Unknown cluster type: thread is left unbound */
		break;
	}
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		/* Re-enter the scheduler so the new binding is honored immediately */
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
	(void)soft_bound;
#endif /* __AMP__ */
}
8957
8958 extern uint32_t thread_bound_cluster_id(thread_t thread);
8959 uint32_t
thread_bound_cluster_id(thread_t thread)8960 thread_bound_cluster_id(thread_t thread)
8961 {
8962 return thread->th_bound_cluster_id;
8963 }
8964
/*
 * Bind or unbind a thread to a specific cluster id.
 *
 * options:
 *   THREAD_BIND_SOFT          - bind, but let the scheduler migrate the
 *                               thread when necessary.
 *   THREAD_BIND_ELIGIBLE_ONLY - fail with KERN_INVALID_POLICY unless the
 *                               thread is eligible for the target pset.
 *   THREAD_UNBIND             - clear any existing binding.
 *
 * Returns KERN_INVALID_ARGUMENT for an out-of-range or uninitialized
 * cluster id, KERN_SUCCESS otherwise. If the target is the current
 * thread, blocks so the change takes effect immediately. Trivial
 * success on non-AMP builds.
 */
__private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
{
#if __AMP__

	processor_set_t pset = NULL;
	if (options & (THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY)) {
		/* Validate the inputs for the bind case */
		int max_clusters = ml_get_cluster_count();
		if (cluster_id >= max_clusters) {
			/* Invalid cluster id */
			return KERN_INVALID_ARGUMENT;
		}
		pset = pset_array[cluster_id];
		if (pset == NULL) {
			/* Cluster has not been initialized yet */
			return KERN_INVALID_ARGUMENT;
		}
		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
				/* Thread is not recommended for the cluster type */
				return KERN_INVALID_POLICY;
			}
		}
	}

	if (options & THREAD_UNBIND) {
		/* If the thread was actually not bound to some cluster, nothing to do here */
		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
			return KERN_SUCCESS;
		}
	}

	spl_t s = splsched();
	thread_lock(thread);

	/* Unbind the thread from its previous bound state */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;

	if (options & THREAD_UNBIND) {
		/* Nothing more to do here */
		goto thread_bind_cluster_complete;
	}

	if (options & THREAD_BIND_SOFT) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	thread->th_bound_cluster_id = cluster_id;

thread_bind_cluster_complete:
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		/* Re-enter the scheduler so the new binding is honored immediately */
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_id;
	(void)options;
#endif /* __AMP__ */
	return KERN_SUCCESS;
}
9029
9030 #if DEVELOPMENT || DEBUG
9031 extern int32_t sysctl_get_bound_cpuid(void);
9032 int32_t
sysctl_get_bound_cpuid(void)9033 sysctl_get_bound_cpuid(void)
9034 {
9035 int32_t cpuid = -1;
9036 thread_t self = current_thread();
9037
9038 processor_t processor = self->bound_processor;
9039 if (processor == NULL) {
9040 cpuid = -1;
9041 } else {
9042 cpuid = processor->cpu_id;
9043 }
9044
9045 return cpuid;
9046 }
9047
9048 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
9049 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)9050 sysctl_thread_bind_cpuid(int32_t cpuid)
9051 {
9052 processor_t processor = PROCESSOR_NULL;
9053
9054 if (cpuid == -1) {
9055 goto unbind;
9056 }
9057
9058 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
9059 return KERN_INVALID_VALUE;
9060 }
9061
9062 processor = processor_array[cpuid];
9063 if (processor == PROCESSOR_NULL) {
9064 return KERN_INVALID_VALUE;
9065 }
9066
9067 #if __AMP__
9068
9069 thread_t thread = current_thread();
9070
9071 if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
9072 if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
9073 /* Cannot hard-bind an already hard-cluster-bound thread */
9074 return KERN_NOT_SUPPORTED;
9075 }
9076 }
9077
9078 #endif /* __AMP__ */
9079
9080 unbind:
9081 thread_bind(processor);
9082
9083 thread_block(THREAD_CONTINUE_NULL);
9084 return KERN_SUCCESS;
9085 }
9086
9087 extern char sysctl_get_task_cluster_type(void);
9088 char
sysctl_get_task_cluster_type(void)9089 sysctl_get_task_cluster_type(void)
9090 {
9091 task_t task = current_task();
9092 processor_set_t pset_hint = task->pset_hint;
9093
9094 if (!pset_hint) {
9095 return '0';
9096 }
9097
9098 #if __AMP__
9099 if (pset_hint->pset_cluster_type == PSET_AMP_E) {
9100 return 'E';
9101 } else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
9102 return 'P';
9103 }
9104 #endif
9105
9106 return '0';
9107 }
9108
9109 #if __AMP__
/*
 * Find a pset belonging to a node of the given cluster type.
 *
 * Within the first node of matching type, prefer a pset that has
 * recommended processors; if none qualifies, return the last pset
 * examined in that node (which may have no recommended CPUs, or be
 * PROCESSOR_SET_NULL if the node's pset_map is empty). Other matching
 * nodes are never considered. Returns PROCESSOR_SET_NULL when no node
 * of the requested type exists.
 */
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer one with recommended processors */
			if (pset->recommended_bitmask != 0) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
9133 #endif
9134
9135 extern void sysctl_task_set_cluster_type(char cluster_type);
9136 void
sysctl_task_set_cluster_type(char cluster_type)9137 sysctl_task_set_cluster_type(char cluster_type)
9138 {
9139 task_t task = current_task();
9140 processor_set_t pset_hint = PROCESSOR_SET_NULL;
9141
9142 #if __AMP__
9143 switch (cluster_type) {
9144 case 'e':
9145 case 'E':
9146 pset_hint = find_pset_of_type(PSET_AMP_E);
9147 break;
9148 case 'p':
9149 case 'P':
9150 pset_hint = find_pset_of_type(PSET_AMP_P);
9151 break;
9152 default:
9153 break;
9154 }
9155
9156 if (pset_hint) {
9157 task_lock(task);
9158 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
9159 task->pset_hint = pset_hint;
9160 task_unlock(task);
9161
9162 thread_block(THREAD_CONTINUE_NULL);
9163 }
9164 #else
9165 (void)cluster_type;
9166 (void)task;
9167 (void)pset_hint;
9168 #endif
9169 }
9170
9171 /*
9172 * The quantum length used for Fixed and RT sched modes. In general the quantum
9173 * can vary - for example for background or QOS.
9174 */
9175 extern uint64_t sysctl_get_quantum_us(void);
9176 uint64_t
sysctl_get_quantum_us(void)9177 sysctl_get_quantum_us(void)
9178 {
9179 uint32_t quantum;
9180 uint64_t quantum_ns;
9181
9182 quantum = SCHED(initial_quantum_size)(THREAD_NULL);
9183 absolutetime_to_nanoseconds(quantum, &quantum_ns);
9184
9185 return quantum_ns / 1000;
9186 }
9187
9188 #endif /* DEVELOPMENT || DEBUG */
9189