1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #include <kern/monotonic.h>
94 #include <kern/processor.h>
95 #include <kern/queue.h>
96 #include <kern/recount.h>
97 #include <kern/restartable.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109
110 #include <vm/pmap.h>
111 #include <vm/vm_kern.h>
112 #include <vm/vm_map.h>
113 #include <vm/vm_pageout.h>
114
115 #include <mach/sdt.h>
116 #include <mach/mach_host.h>
117 #include <mach/host_info.h>
118
119 #include <sys/kdebug.h>
120 #include <kperf/kperf.h>
121 #include <kern/kpc.h>
122 #include <san/kasan.h>
123 #include <kern/pms.h>
124 #include <kern/host.h>
125 #include <stdatomic.h>
126 #include <os/atomic_private.h>
127
/*
 * Scheduler trace macro: prefer the macOS-release-only kdebug variant when
 * the build provides it, otherwise fall back to the plain release macro.
 */
#ifdef KDBG_MACOS_RELEASE
#define KTRC KDBG_MACOS_RELEASE
#else
#define KTRC KDBG_RELEASE
#endif
133
134 struct sched_statistics PERCPU_DATA(sched_stats);
135 bool sched_stats_active;
136
/*
 * Saturating addition for deadlines: returns d + e, clamped to
 * UINT64_MAX instead of wrapping on overflow.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	if (d > UINT64_MAX - e) {
		return UINT64_MAX;
	}
	return d + e;
}
143
144 int
rt_runq_count(processor_set_t pset)145 rt_runq_count(processor_set_t pset)
146 {
147 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
148 }
149
150 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)151 rt_runq_earliest_deadline(processor_set_t pset)
152 {
153 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
154 }
155
156 static int
rt_runq_priority(processor_set_t pset)157 rt_runq_priority(processor_set_t pset)
158 {
159 pset_assert_locked(pset);
160 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
161
162 bitmap_t *map = rt_run_queue->bitmap;
163 int i = bitmap_first(map, NRTQS);
164 assert(i < NRTQS);
165
166 if (i >= 0) {
167 return i + BASEPRI_RTQUEUES;
168 }
169
170 return i;
171 }
172
173 static thread_t rt_runq_first(rt_queue_t rt_runq);
174
#if DEBUG
/*
 * Debug-only invariant checker for a pset's realtime run queue.
 * Recomputes all aggregates from scratch and asserts that:
 *  - every queued thread has the priority of its queue and non-sentinel
 *    deadline/constraint values,
 *  - each per-priority queue is sorted by non-decreasing deadline,
 *  - the cached per-priority fields (pri_count, pri_earliest_deadline,
 *    pri_constraint) and the occupancy bitmap match reality,
 *  - the queue-wide aggregates (count, earliest_deadline, constraint,
 *    ed_index) match the recomputed values,
 *  - if 'thread' is non-NULL, it is actually enqueued somewhere.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			/* RT_DEADLINE_NONE / RT_CONSTRAINT_NONE are sentinels, never valid values */
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* each queue must be sorted by non-decreasing deadline */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* cached per-priority fields must mirror the head thread */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* empty queue: bit clear, cached fields reset to sentinels */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		/* track the queue-wide earliest deadline and its owning index */
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* queue-wide aggregates must match the full recomputation above */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
#endif
238
239 uint32_t rt_constraint_threshold;
240
241 static bool
rt_runq_is_low_latency(processor_set_t pset)242 rt_runq_is_low_latency(processor_set_t pset)
243 {
244 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
245 }
246
247 TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
248
249 /* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
250 TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
251 static uint64_t nonurgent_preemption_timer_abs = 0;
252
253 #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
254 TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
255
256 #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
257 TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
258
259 #define MAX_UNSAFE_RT_QUANTA 100
260 #define SAFE_RT_MULTIPLIER 2
261
262 #define MAX_UNSAFE_FIXED_QUANTA 100
263 #define SAFE_FIXED_MULTIPLIER 2
264
265 TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
266 TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
267
268 TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
269 TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_RT_MULTIPLIER);
270
271 #define MAX_POLL_QUANTA 2
272 TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
273
274 #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
275 int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
276
277 uint64_t max_poll_computation;
278
279 uint64_t max_unsafe_rt_computation;
280 uint64_t max_unsafe_fixed_computation;
281 uint64_t sched_safe_rt_duration;
282 uint64_t sched_safe_fixed_duration;
283
284 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
285
286 uint32_t std_quantum;
287 uint32_t min_std_quantum;
288 uint32_t bg_quantum;
289
290 uint32_t std_quantum_us;
291 uint32_t bg_quantum_us;
292
293 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
294
295 uint32_t thread_depress_time;
296 uint32_t default_timeshare_computation;
297 uint32_t default_timeshare_constraint;
298
299 uint32_t max_rt_quantum;
300 uint32_t min_rt_quantum;
301
302 uint32_t rt_deadline_epsilon;
303
304 uint32_t rt_constraint_threshold;
305
306 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
307
308 unsigned sched_tick;
309 uint32_t sched_tick_interval;
310
311 /* Timeshare load calculation interval (15ms) */
312 uint32_t sched_load_compute_interval_us = 15000;
313 uint64_t sched_load_compute_interval_abs;
314 static _Atomic uint64_t sched_load_compute_deadline;
315
316 uint32_t sched_pri_shifts[TH_BUCKET_MAX];
317 uint32_t sched_fixed_shift;
318
319 uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
320
321 /* Allow foreground to decay past default to resolve inversions */
322 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
323 int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324
325 /* Defaults for timer deadline profiling */
326 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
327 * 2ms */
328 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
329 * <= 5ms */
330
331 uint64_t timer_deadline_tracking_bin_1;
332 uint64_t timer_deadline_tracking_bin_2;
333
334 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
335
336 thread_t sched_maintenance_thread;
337
338 /* interrupts disabled lock to guard recommended cores state */
339 decl_simple_lock_data(, sched_available_cores_lock);
340 uint64_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
341 uint64_t perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
342 uint64_t perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343 static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344 static uint64_t sched_online_processors = 0;
345 static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
346 static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
347
348 #if __arm64__
349 static void sched_recommended_cores_maintenance(void);
350 uint64_t perfcontrol_failsafe_starvation_threshold;
351 extern char *proc_name_address(struct proc *p);
352 #endif /* __arm64__ */
353
354 uint64_t sched_one_second_interval;
355 boolean_t allow_direct_handoff = TRUE;
356
357 /* Forwards */
358
359 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
360
361 static void load_shift_init(void);
362 static void preempt_pri_init(void);
363
364 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
365
366 thread_t processor_idle(
367 thread_t thread,
368 processor_t processor);
369
370 static ast_t
371 csw_check_locked(
372 thread_t thread,
373 processor_t processor,
374 processor_set_t pset,
375 ast_t check_reason);
376
377 static void processor_setrun(
378 processor_t processor,
379 thread_t thread,
380 integer_t options);
381
382 static void
383 sched_realtime_timebase_init(void);
384
385 static void
386 sched_timer_deadline_tracking_init(void);
387
388 #if DEBUG
389 extern int debug_task;
390 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
391 #else
392 #define TLOG(a, fmt, args...) do {} while (0)
393 #endif
394
395 static processor_t
396 thread_bind_internal(
397 thread_t thread,
398 processor_t processor);
399
400 static void
401 sched_vm_group_maintenance(void);
402
403 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
404 int8_t sched_load_shifts[NRQS];
405 bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
406 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
407
408 /*
409 * Statically allocate a buffer to hold the longest possible
410 * scheduler description string, as currently implemented.
411 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
412 * to export to userspace via sysctl(3). If either version
413 * changes, update the other.
414 *
415 * Note that in addition to being an upper bound on the strings
416 * in the kernel, it's also an exact parameter to PE_get_default(),
417 * which interrogates the device tree on some platforms. That
418 * API requires the caller know the exact size of the device tree
419 * property, so we need both a legacy size (32) and the current size
420 * (48) to deal with old and new device trees. The device tree property
421 * is similarly padded to a fixed size so that the same kernel image
422 * can run on multiple devices with different schedulers configured
423 * in the device tree.
424 */
425 char sched_string[SCHED_STRING_MAX_LENGTH];
426
427 uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
428
429 /* Global flag which indicates whether Background Stepper Context is enabled */
430 static int cpu_throttle_enabled = 1;
431
432 #if DEVELOPMENT || DEBUG
433 int enable_task_set_cluster_type = 0;
434 bool system_ecore_only = false;
435 #endif /* DEVELOPMENT || DEBUG */
436
/*
 * One-time scheduler bring-up: consumes boot-args / device-tree tunables,
 * then initializes the selected scheduler, the boot pset's RT run queue,
 * pset0, and the master processor.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the active scheduler's name (mirrored to userspace via sysctl). */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	/* Initialize the chosen scheduler and the boot pset/processor. */
	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	/* enable_skstsct == 2 additionally forces the system to E-cores only */
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}
483
484 void
sched_timebase_init(void)485 sched_timebase_init(void)
486 {
487 uint64_t abstime;
488
489 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
490 sched_one_second_interval = abstime;
491
492 SCHED(timebase_init)();
493 sched_realtime_timebase_init();
494 }
495
496 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
497
498 void
sched_timeshare_init(void)499 sched_timeshare_init(void)
500 {
501 /*
502 * Calculate the timeslicing quantum
503 * in us.
504 */
505 if (default_preemption_rate < 1) {
506 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
507 }
508 std_quantum_us = (1000 * 1000) / default_preemption_rate;
509
510 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
511
512 if (default_bg_preemption_rate < 1) {
513 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
514 }
515 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
516
517 printf("standard background quantum is %d us\n", bg_quantum_us);
518
519 load_shift_init();
520 preempt_pri_init();
521 sched_tick = 0;
522 }
523
524 void
sched_set_max_unsafe_rt_quanta(int max)525 sched_set_max_unsafe_rt_quanta(int max)
526 {
527 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
528
529 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
530
531 const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
532 sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
533
534
535 #if DEVELOPMENT || DEBUG
536 max_unsafe_rt_quanta = max;
537 #else
538 /*
539 * On RELEASE kernels, this is only called on boot where
540 * max is already equal to max_unsafe_rt_quanta.
541 */
542 assert3s(max, ==, max_unsafe_rt_quanta);
543 #endif
544 }
545
546 void
sched_set_max_unsafe_fixed_quanta(int max)547 sched_set_max_unsafe_fixed_quanta(int max)
548 {
549 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
550
551 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
552
553 const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
554 sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
555
556 #if DEVELOPMENT || DEBUG
557 max_unsafe_fixed_quanta = max;
558 #else
559 /*
560 * On RELEASE kernels, this is only called on boot where
561 * max is already equal to max_unsafe_fixed_quanta.
562 */
563 assert3s(max, ==, max_unsafe_fixed_quanta);
564 #endif
565 }
566
/*
 * Convert all timeshare tunables from microseconds into mach absolute-time
 * units, and derive values that depend on them (fail-safe durations,
 * poll limits, depression time, default computation/constraint).
 * Runs after the timebase is known; every quantum must fit in 32 bits.
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	/* converted quanta must be non-zero and fit in 32 bits of abs-time */
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	/* NOTE(review): the 5/3 scale feeds the fixed-shift search below; the
	 * resulting shift is applied to sched_usage elsewhere — confirm against
	 * the priority computation before changing. */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* INT8_MAX here effectively disables usage-based shifts until loads are computed */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* derive the fail-safe computation limits from the (tunable) quanta counts */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	/* failsafe engages after two scheduler ticks of starvation */
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
631
632 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
633
634 void
pset_rt_init(processor_set_t pset)635 pset_rt_init(processor_set_t pset)
636 {
637 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
638 int i = pri - BASEPRI_RTQUEUES;
639 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
640 queue_init(&rqi->pri_queue);
641 rqi->pri_count = 0;
642 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
643 rqi->pri_constraint = RT_CONSTRAINT_NONE;
644 }
645 os_atomic_init(&pset->rt_runq.count, 0);
646 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
647 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
648 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
649 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
650 }
651
/* Epsilon for comparing RT deadlines, in microseconds (default 100us). */
int rt_deadline_epsilon_us = 100;

/*
 * Return the current RT deadline-comparison epsilon, in microseconds.
 */
int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}
660
661 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)662 sched_set_rt_deadline_epsilon(int new_epsilon_us)
663 {
664 rt_deadline_epsilon_us = new_epsilon_us;
665
666 uint64_t abstime;
667 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
668 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
669 rt_deadline_epsilon = (uint32_t)abstime;
670 }
671
672 static void
sched_realtime_timebase_init(void)673 sched_realtime_timebase_init(void)
674 {
675 uint64_t abstime;
676
677 /* smallest rt computation (50 us) */
678 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
679 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
680 min_rt_quantum = (uint32_t)abstime;
681
682 /* maximum rt computation (50 ms) */
683 clock_interval_to_absolutetime_interval(
684 50, 1000 * NSEC_PER_USEC, &abstime);
685 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
686 max_rt_quantum = (uint32_t)abstime;
687
688 /* constraint threshold for sending backup IPIs (4 ms) */
689 clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
690 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
691 rt_constraint_threshold = (uint32_t)abstime;
692
693 /* epsilon for comparing deadlines */
694 sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
695 }
696
697 void
sched_check_spill(processor_set_t pset,thread_t thread)698 sched_check_spill(processor_set_t pset, thread_t thread)
699 {
700 (void)pset;
701 (void)thread;
702
703 return;
704 }
705
706 bool
sched_thread_should_yield(processor_t processor,thread_t thread)707 sched_thread_should_yield(processor_t processor, thread_t thread)
708 {
709 (void)thread;
710
711 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
712 }
713
714 /* Default implementations of .steal_thread_enabled */
715 bool
sched_steal_thread_DISABLED(processor_set_t pset)716 sched_steal_thread_DISABLED(processor_set_t pset)
717 {
718 (void)pset;
719 return false;
720 }
721
722 bool
sched_steal_thread_enabled(processor_set_t pset)723 sched_steal_thread_enabled(processor_set_t pset)
724 {
725 return bit_count(pset->node->pset_map) > 1;
726 }
727
728 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
729
730 /*
731 * Set up values for timeshare
732 * loading factors.
733 */
/*
 * Build the sched_load_shifts[] table, indexed by run-queue load, which
 * gives the (power-of-two) priority-decay shift applied per quantum of
 * CPU usage at that load. Honors the "sched_decay_penalty" and
 * "sched_decay_usage_age_factor" boot-args.
 */
static void
load_shift_init(void)
{
	int8_t k, *p = sched_load_shifts;
	uint32_t i, j;

	uint32_t sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1 are special-cased: INT8_MIN (no penalty) and shift 0. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/* Each time the load crosses the next doubling boundary "j", the shift
	 * "k" increments; the starting boundary scales with sched_decay_penalty. */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
779
780 static void
preempt_pri_init(void)781 preempt_pri_init(void)
782 {
783 bitmap_t *p = sched_preempt_pri;
784
785 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
786 bitmap_set(p, i);
787 }
788
789 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
790 bitmap_set(p, i);
791 }
792 }
793
794 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
795
796 void
check_monotonic_time(uint64_t ctime)797 check_monotonic_time(uint64_t ctime)
798 {
799 processor_t processor = current_processor();
800 uint64_t last_dispatch = processor->last_dispatch;
801
802 if (last_dispatch > ctime) {
803 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
804 last_dispatch, ctime);
805 }
806 }
807
808
809 /*
810 * Thread wait timer expiration.
811 * Runs in timer interrupt context with interrupts disabled.
812 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	if (thread->wait_timer_armed) {
		/* Disarm before waking so a concurrent waker doesn't re-cancel us. */
		thread->wait_timer_armed = false;
		/* Wake the thread with a timed-out wait result. */
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	/* This callout invocation is done; drop the active-call count. */
	thread->wait_timer_active--;

	thread_unlock(thread);
}
834
835 /*
836 * thread_unblock:
837 *
838 * Unblock thread on wake up.
839 *
840 * Returns TRUE if the thread should now be placed on the runqueue.
841 *
842 * Thread must be locked.
843 *
844 * Called at splsched().
845 */
boolean_t
thread_unblock(
	thread_t thread,
	wait_result_t wresult)
{
	boolean_t ready_for_runq = FALSE;
	thread_t cthread = current_thread();
	uint32_t new_run_count;
	int old_thread_state;

	/*
	 * Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 * Cancel pending wait timer.
	 */
	if (thread->wait_timer_armed) {
		if (timer_call_cancel(thread->wait_timer)) {
			/* Cancel beat the callout; drop the count it would have dropped. */
			thread->wait_timer_active--;
		}
		thread->wait_timer_armed = false;
	}

	/* aticontext: woken from interrupt context; pidle: from "platform idle". */
	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 * Update scheduling state: not waiting,
	 * set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);

	if ((old_thread_state & TH_RUN) == 0) {
		/* Thread was truly blocked: it transitions to runnable here. */
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */

	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		/* The deadline restarts from the wakeup time: now + constraint. */
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		/* Directly woken from interrupt context (and not a callout thread). */
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		if (ttd) {
			/* Bucket the wakeup by the timer's time-to-deadline. */
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			/* "Double hop": an interrupt woke the callout thread, which woke us. */
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		/* Record how this callout thread was woken, for the next hop's accounting. */
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
1000
1001 /*
1002 * Routine: thread_allowed_for_handoff
1003 * Purpose:
1004 * Check if the thread is allowed for handoff operation
1005 * Conditions:
1006 * thread lock held, IPC locks may be held.
1007 * TODO: In future, do not allow handoff if threads have different cluster
1008 * recommendations.
1009 */
1010 boolean_t
thread_allowed_for_handoff(thread_t thread)1011 thread_allowed_for_handoff(
1012 thread_t thread)
1013 {
1014 thread_t self = current_thread();
1015
1016 if (allow_direct_handoff &&
1017 thread->sched_mode == TH_MODE_REALTIME &&
1018 self->sched_mode == TH_MODE_REALTIME) {
1019 return TRUE;
1020 }
1021
1022 return FALSE;
1023 }
1024
1025 /*
1026 * Routine: thread_go
1027 * Purpose:
1028 * Unblock and dispatch thread.
1029 * Conditions:
1030 * thread lock held, IPC locks may be held.
1031 * thread must have been waiting
1032 */
void
thread_go(
	thread_t        thread,
	wait_result_t   wresult,
	bool            try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/* The wakee must already be detached from its wait: no event, no waitq. */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_is_null(thread->waitq));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);

	/* A thread that has run before must have been marked TH_WAKING on its way here. */
	if (thread->started) {
		assert(thread->state & TH_WAKING);
	}

	thread_lock_assert(thread, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);

	/* thread_unblock returns nonzero when the thread should be placed on a runq. */
	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			/*
			 * Stash a reference on the current thread instead of enqueueing;
			 * presumably consumed/released by the context-switch path — confirm.
			 */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}
}
1073
1074 /*
1075 * Routine: thread_mark_wait_locked
1076 * Purpose:
1077 * Mark a thread as waiting. If, given the circumstances,
1078 * it doesn't want to wait (i.e. already aborted), then
1079 * indicate that in the return value.
1080 * Conditions:
1081 * at splsched() and thread is locked.
1082 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t        thread,
	wait_interrupt_t interruptible_orig)
{
	boolean_t at_safe_point;
	wait_interrupt_t interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	/* The thread must not already be waiting, waking, or otherwise committed. */
	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 * The thread may have certain types of interrupts/aborts masked
	 * off.  Even if the wait location says these types of interrupts
	 * are OK, we have to honor mask settings (outer-scoped code may
	 * not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/*
	 * Commit to waiting unless there is a deliverable abort pending:
	 * an uninterruptible wait always proceeds; an aborted thread only
	 * proceeds if the abort is "safely" deferred and we are not at a
	 * safe point.
	 */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			/* THREAD_UNINT is 0, so !interruptible means uninterruptible. */
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/* Suppress the TH_WAIT_REPORT sched-call only if the caller asked. */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Abort delivered now: clear the "safely" deferral so it isn't re-delivered. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1151
1152 /*
1153 * Routine: thread_interrupt_level
1154 * Purpose:
1155 * Set the maximum interruptible state for the
1156 * current thread. The effective value of any
1157 * interruptible flag passed into assert_wait
1158 * will never exceed this.
1159 *
1160 * Useful for code that must not be interrupted,
1161 * but which calls code that doesn't know that.
1162 * Returns:
1163 * The old interrupt level for the thread.
1164 */
1165 __private_extern__
1166 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1167 thread_interrupt_level(
1168 wait_interrupt_t new_level)
1169 {
1170 thread_t thread = current_thread();
1171 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1172
1173 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1174
1175 return result;
1176 }
1177
1178 /*
1179 * assert_wait:
1180 *
1181 * Assert that the current thread is about to go to
1182 * sleep until the specified event occurs.
1183 */
1184 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1185 assert_wait(
1186 event_t event,
1187 wait_interrupt_t interruptible)
1188 {
1189 if (__improbable(event == NO_EVENT)) {
1190 panic("%s() called with NO_EVENT", __func__);
1191 }
1192
1193 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1194 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1195 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1196
1197 struct waitq *waitq;
1198 waitq = global_eventq(event);
1199 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1200 }
1201
1202 /*
1203 * assert_wait_queue:
1204 *
1205 * Return the global waitq for the specified event
1206 */
1207 struct waitq *
assert_wait_queue(event_t event)1208 assert_wait_queue(
1209 event_t event)
1210 {
1211 return global_eventq(event);
1212 }
1213
1214 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1215 assert_wait_timeout(
1216 event_t event,
1217 wait_interrupt_t interruptible,
1218 uint32_t interval,
1219 uint32_t scale_factor)
1220 {
1221 thread_t thread = current_thread();
1222 wait_result_t wresult;
1223 uint64_t deadline;
1224 spl_t s;
1225
1226 if (__improbable(event == NO_EVENT)) {
1227 panic("%s() called with NO_EVENT", __func__);
1228 }
1229
1230 struct waitq *waitq;
1231 waitq = global_eventq(event);
1232
1233 s = splsched();
1234 waitq_lock(waitq);
1235
1236 clock_interval_to_deadline(interval, scale_factor, &deadline);
1237
1238 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1239 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1240 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1241
1242 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1243 interruptible,
1244 TIMEOUT_URGENCY_SYS_NORMAL,
1245 deadline, TIMEOUT_NO_LEEWAY,
1246 thread);
1247
1248 waitq_unlock(waitq);
1249 splx(s);
1250 return wresult;
1251 }
1252
1253 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1254 assert_wait_timeout_with_leeway(
1255 event_t event,
1256 wait_interrupt_t interruptible,
1257 wait_timeout_urgency_t urgency,
1258 uint32_t interval,
1259 uint32_t leeway,
1260 uint32_t scale_factor)
1261 {
1262 thread_t thread = current_thread();
1263 wait_result_t wresult;
1264 uint64_t deadline;
1265 uint64_t abstime;
1266 uint64_t slop;
1267 uint64_t now;
1268 spl_t s;
1269
1270 if (__improbable(event == NO_EVENT)) {
1271 panic("%s() called with NO_EVENT", __func__);
1272 }
1273
1274 now = mach_absolute_time();
1275 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1276 deadline = now + abstime;
1277
1278 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1279
1280 struct waitq *waitq;
1281 waitq = global_eventq(event);
1282
1283 s = splsched();
1284 waitq_lock(waitq);
1285
1286 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1287 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1288 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1289
1290 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1291 interruptible,
1292 urgency, deadline, slop,
1293 thread);
1294
1295 waitq_unlock(waitq);
1296 splx(s);
1297 return wresult;
1298 }
1299
1300 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1301 assert_wait_deadline(
1302 event_t event,
1303 wait_interrupt_t interruptible,
1304 uint64_t deadline)
1305 {
1306 thread_t thread = current_thread();
1307 wait_result_t wresult;
1308 spl_t s;
1309
1310 if (__improbable(event == NO_EVENT)) {
1311 panic("%s() called with NO_EVENT", __func__);
1312 }
1313
1314 struct waitq *waitq;
1315 waitq = global_eventq(event);
1316
1317 s = splsched();
1318 waitq_lock(waitq);
1319
1320 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1321 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1322 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1323
1324 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1325 interruptible,
1326 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1327 TIMEOUT_NO_LEEWAY, thread);
1328 waitq_unlock(waitq);
1329 splx(s);
1330 return wresult;
1331 }
1332
1333 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1334 assert_wait_deadline_with_leeway(
1335 event_t event,
1336 wait_interrupt_t interruptible,
1337 wait_timeout_urgency_t urgency,
1338 uint64_t deadline,
1339 uint64_t leeway)
1340 {
1341 thread_t thread = current_thread();
1342 wait_result_t wresult;
1343 spl_t s;
1344
1345 if (__improbable(event == NO_EVENT)) {
1346 panic("%s() called with NO_EVENT", __func__);
1347 }
1348
1349 struct waitq *waitq;
1350 waitq = global_eventq(event);
1351
1352 s = splsched();
1353 waitq_lock(waitq);
1354
1355 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1356 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1357 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1358
1359 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1360 interruptible,
1361 urgency, deadline, leeway,
1362 thread);
1363 waitq_unlock(waitq);
1364 splx(s);
1365 return wresult;
1366 }
1367
/*
 * Initialize a scheduler condition variable to its quiescent state
 * (SCHED_COND_INIT) before any waiter or signaler touches it.
 */
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	os_atomic_init(cond, SCHED_COND_INIT);
}
1374
/*
 * Wait on a sched condition, racing against concurrent signalers.
 * The wait must be asserted BEFORE clearing SCHED_COND_ACTIVE so that a
 * signaler who observes the inactive state finds us already on the waitq.
 */
wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation,
	void *parameter)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	/* No pending wakeup: block (possibly discarding the stack via continuation). */
	return thread_block_parameter(continuation, parameter);
}
1394
1395 wait_result_t
sched_cond_wait(sched_cond_atomic_t * cond,wait_interrupt_t interruptible,thread_continue_t continuation)1396 sched_cond_wait(
1397 sched_cond_atomic_t *cond,
1398 wait_interrupt_t interruptible,
1399 thread_continue_t continuation)
1400 {
1401 return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
1402 }
1403
/*
 * Acknowledge a wakeup: atomically set SCHED_COND_ACTIVE and clear
 * SCHED_COND_WAKEUP in one XOR (valid only when ACTIVE is currently
 * clear and WAKEUP is currently set — the assert checks the former).
 * Acquire ordering pairs with the signaler's release in sched_cond_signal.
 */
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1412
/*
 * Signal a sched condition, waking `thread` only if it was inactive and
 * no wakeup was already pending.  Preemption is disabled around the
 * atomic-or + wakeup so the sequence completes without interleaving a
 * context switch on this CPU.
 */
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t *cond,
	thread_t thread)
{
	disable_preemption();
	/* Release pairs with the acquire in sched_cond_ack on the waiter side. */
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1427
1428 /*
1429 * thread_isoncpu:
1430 *
1431 * Return TRUE if a thread is running on a processor such that an AST
1432 * is needed to pull it out of userspace execution, or if executing in
1433 * the kernel, bring to a context switch boundary that would cause
1434 * thread state to be serialized in the thread PCB.
1435 *
1436 * Thread locked, returns the same way. While locked, fields
1437 * like "state" cannot change. "runq" can change only from set to unset.
1438 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread_get_runq(thread) != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}
1471
1472 /*
1473 * thread_stop:
1474 *
1475 * Force a preemption point for a thread and wait
1476 * for it to stop running on a CPU. If a stronger
1477 * guarantee is requested, wait until no longer
1478 * runnable. Arbitrates access among
1479 * multiple stop requests. (released by unstop)
1480 *
1481 * The thread must enter a wait state and stop via a
1482 * separate means.
1483 *
1484 * Returns FALSE if interrupted.
1485 */
boolean_t
thread_stop(
	thread_t                thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	spl_t           s = splsched();
	boolean_t       oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/* Arbitrate with other stoppers: only one TH_SUSP holder at a time. */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		/* Sleep until the current holder releases TH_SUSP (see thread_unstop). */
		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted before acquiring TH_SUSP; nothing to undo. */
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	/* We now own the stop request. */
	thread->state |= TH_SUSP;

	/*
	 * Kick the thread off-CPU (via AST) and wait for it to leave, and —
	 * if requested — to stop being runnable entirely.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted after taking TH_SUSP: release it before failing. */
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1565
1566 /*
1567 * thread_unstop:
1568 *
1569 * Release a previous stop request and set
1570 * the thread running if appropriate.
1571 *
1572 * Use only after a successful stop operation.
1573 */
void
thread_unstop(
	thread_t        thread)
{
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A thread holding only TH_SUSP (neither running nor waiting) is invalid here. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			/* Someone is sleeping on wake_active (thread_stop/thread_wait) — rouse them. */
			thread->wake_active = FALSE;
			thread_unlock(thread);

			/* thread lock dropped before the wakeup; wake_lock still held until after. */
			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1604
1605 /*
1606 * thread_wait:
1607 *
1608 * Wait for a thread to stop running. (non-interruptible)
1609 *
1610 */
void
thread_wait(
	thread_t        thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	boolean_t       oncpu;
	processor_t     processor;
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			/* Force the remote CPU to a context-switch boundary. */
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		/* Uninterruptible — unlike thread_stop, this cannot fail or bail out. */
		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1659
1660 /*
1661 * Routine: clear_wait_internal
1662 *
1663 * Clear the wait condition for the specified thread.
1664 * Start the thread executing if that is appropriate.
1665 * Arguments:
1666 * thread thread to awaken
1667 * result Wakeup result the thread should see
1668 * Conditions:
1669 * At splsched
1670 * the thread is locked.
1671 * Returns:
1672 * KERN_SUCCESS thread was rousted out a wait
1673 * KERN_FAILURE thread was waiting but could not be rousted
1674 * KERN_NOT_WAITING thread was not waiting
1675 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t        thread,
	wait_result_t   wresult)
{
	waitq_t waitq = thread->waitq;

	/* An uninterruptible waiter cannot be rousted by an interrupt. */
	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/*
	 * Check that the thread is waiting and not waking, as a waking thread
	 * has already cleared its waitq, and is destined to be go'ed, don't
	 * need to do it again.
	 */
	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
		assert(waitq_is_null(thread->waitq));
		return KERN_NOT_WAITING;
	}

	/* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	thread_go(thread, wresult, /* handoff */ false);

	return KERN_SUCCESS;
}
1706
1707
1708 /*
1709 * clear_wait:
1710 *
1711 * Clear the wait condition for the specified thread. Start the thread
1712 * executing if that is appropriate.
1713 *
1714 * parameters:
1715 * thread thread to awaken
1716 * result Wakeup result the thread should see
1717 */
1718 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1719 clear_wait(
1720 thread_t thread,
1721 wait_result_t result)
1722 {
1723 kern_return_t ret;
1724 spl_t s;
1725
1726 s = splsched();
1727 thread_lock(thread);
1728
1729 ret = clear_wait_internal(thread, result);
1730
1731 if (thread == current_thread()) {
1732 /*
1733 * The thread must be ready to wait again immediately
1734 * after clearing its own wait.
1735 */
1736 assert((thread->state & TH_WAKING) == 0);
1737 }
1738
1739 thread_unlock(thread);
1740 splx(s);
1741 return ret;
1742 }
1743
1744
1745 /*
1746 * thread_wakeup_prim:
1747 *
1748 * Common routine for thread_wakeup, thread_wakeup_with_result,
1749 * and thread_wakeup_one.
1750 *
1751 */
1752 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1753 thread_wakeup_prim(
1754 event_t event,
1755 boolean_t one_thread,
1756 wait_result_t result)
1757 {
1758 if (__improbable(event == NO_EVENT)) {
1759 panic("%s() called with NO_EVENT", __func__);
1760 }
1761
1762 struct waitq *wq = global_eventq(event);
1763
1764 if (one_thread) {
1765 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1766 } else {
1767 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1768 }
1769 }
1770
1771 /*
1772 * Wakeup a specified thread if and only if it's waiting for this event
1773 */
1774 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1775 thread_wakeup_thread(
1776 event_t event,
1777 thread_t thread)
1778 {
1779 if (__improbable(event == NO_EVENT)) {
1780 panic("%s() called with NO_EVENT", __func__);
1781 }
1782
1783 if (__improbable(thread == THREAD_NULL)) {
1784 panic("%s() called with THREAD_NULL", __func__);
1785 }
1786
1787 struct waitq *wq = global_eventq(event);
1788
1789 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1790 }
1791
1792 /*
1793 * Wakeup a thread waiting on an event and promote it to a priority.
1794 *
1795 * Requires woken thread to un-promote itself when done.
1796 */
1797 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1798 thread_wakeup_one_with_pri(
1799 event_t event,
1800 int priority)
1801 {
1802 if (__improbable(event == NO_EVENT)) {
1803 panic("%s() called with NO_EVENT", __func__);
1804 }
1805
1806 struct waitq *wq = global_eventq(event);
1807
1808 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1809 }
1810
1811 /*
1812 * Wakeup a thread waiting on an event,
1813 * promote it to a priority,
1814 * and return a reference to the woken thread.
1815 *
1816 * Requires woken thread to un-promote itself when done.
1817 */
1818 thread_t
thread_wakeup_identify(event_t event,int priority)1819 thread_wakeup_identify(event_t event,
1820 int priority)
1821 {
1822 if (__improbable(event == NO_EVENT)) {
1823 panic("%s() called with NO_EVENT", __func__);
1824 }
1825
1826 struct waitq *wq = global_eventq(event);
1827
1828 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1829 }
1830
1831 /*
1832 * thread_bind:
1833 *
1834 * Force the current thread to execute on the specified processor.
1835 * Takes effect after the next thread_block().
1836 *
1837 * Returns the previous binding. PROCESSOR_NULL means
1838 * not bound.
1839 *
1840 * XXX - DO NOT export this to users - XXX
1841 */
1842 processor_t
thread_bind(processor_t processor)1843 thread_bind(
1844 processor_t processor)
1845 {
1846 thread_t self = current_thread();
1847 processor_t prev;
1848 spl_t s;
1849
1850 s = splsched();
1851 thread_lock(self);
1852
1853 prev = thread_bind_internal(self, processor);
1854
1855 thread_unlock(self);
1856 splx(s);
1857
1858 return prev;
1859 }
1860
1861 void
thread_bind_during_wakeup(thread_t thread,processor_t processor)1862 thread_bind_during_wakeup(thread_t thread, processor_t processor)
1863 {
1864 assert(!ml_get_interrupts_enabled());
1865 assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1866 #if MACH_ASSERT
1867 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1868 #endif
1869
1870 if (thread->bound_processor != processor) {
1871 thread_bind_internal(thread, processor);
1872 }
1873 }
1874
void
thread_unbind_after_queue_shutdown(
	thread_t                thread,
	processor_t             processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		/* The thread must have been bound to the processor being shut down. */
		assert(thread->bound_processor == processor);

		/* Pull the thread off the (now-dead) runqueue before rebinding. */
		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);
		if (removed) {
			/* Re-enqueue so the (now unbound) thread gets picked up elsewhere. */
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}
1902
1903 /*
1904 * thread_bind_internal:
1905 *
1906 * If the specified thread is not the current thread, and it is currently
1907 * running on another CPU, a remote AST must be sent to that CPU to cause
1908 * the thread to migrate to its bound processor. Otherwise, the migration
1909 * will occur at the next quantum expiration or blocking point.
1910 *
1911 * When the thread is the current thread, and explicit thread_block() should
1912 * be used to force the current processor to context switch away and
1913 * let the thread migrate to the bound processor.
1914 *
1915 * Thread must be locked, and at splsched.
1916 */
1917
1918 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)1919 thread_bind_internal(
1920 thread_t thread,
1921 processor_t processor)
1922 {
1923 processor_t prev;
1924
1925 /* <rdar://problem/15102234> */
1926 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1927 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1928 thread_assert_runq_null(thread);
1929
1930 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
1931 thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
1932
1933 prev = thread->bound_processor;
1934 thread->bound_processor = processor;
1935
1936 return prev;
1937 }
1938
1939 /*
1940 * thread_vm_bind_group_add:
1941 *
1942 * The "VM bind group" is a special mechanism to mark a collection
1943 * of threads from the VM subsystem that, in general, should be scheduled
1944 * with only one CPU of parallelism. To accomplish this, we initially
1945 * bind all the threads to the master processor, which has the effect
1946 * that only one of the threads in the group can execute at once, including
1947 * preempting threads in the group that are a lower priority. Future
1948 * mechanisms may use more dynamic mechanisms to prevent the collection
1949 * of VM threads from using more CPU time than desired.
1950 *
1951 * The current implementation can result in priority inversions where
1952 * compute-bound priority 95 or realtime threads that happen to have
1953 * landed on the master processor prevent the VM threads from running.
1954 * When this situation is detected, we unbind the threads for one
1955 * scheduler tick to allow the scheduler to run the threads an
1956 * additional CPUs, before restoring the binding (assuming high latency
1957 * is no longer a problem).
1958 */
1959
1960 /*
1961 * The current max is provisioned for:
1962 * vm_compressor_swap_trigger_thread (92)
1963 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1964 * vm_pageout_continue (92)
1965 * memorystatus_thread (95)
1966 */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the thread list and count below against concurrent registration. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Threads enrolled via thread_vm_bind_group_add(); entries are never removed here. */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while binding has been temporarily lifted to relieve observed latency. */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1972
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	/* Keep a ref for the group list entry; the list never drops members. */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1991
/*
 * Periodic policy check for the VM bind group: detect priority-inversion-style
 * latency on the master processor and temporarily unbind (or re-bind) the
 * group's threads accordingly.  See the block comment above
 * MAX_VM_BIND_GROUP_COUNT for the overall design.
 */
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: observe. A runnable thread stuck on a runq for > one sched tick
	 * counts as high latency; a runnable thread NOT on a runq suggests one is
	 * actually making progress on a CPU. */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	/* Decide whether to flip the binding state based on the observations. */
	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new binding to each thread that can safely take it. */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2089
/* Default for sched_avoid_cpu0: avoid cpu0 only on x86_64 */
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* Allow realtime threads to run on SMT secondary CPUs */
int sched_allow_rt_smt = 1;
/* 1: prefer cpu0 as backup; 2: prefer secondary CPUs as backup (see thread_select) */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
/* Allow realtime threads to be stolen across psets */
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Number of extra processors signalled as backups for realtime threads */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2102
2103 int
sched_get_rt_n_backup_processors(void)2104 sched_get_rt_n_backup_processors(void)
2105 {
2106 return sched_rt_n_backup_processors;
2107 }
2108
2109 void
sched_set_rt_n_backup_processors(int n)2110 sched_set_rt_n_backup_processors(int n)
2111 {
2112 if (n < 0) {
2113 n = 0;
2114 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2115 n = SCHED_MAX_BACKUP_PROCESSORS;
2116 }
2117
2118 sched_rt_n_backup_processors = n;
2119 }
2120
/*
 * When set, a higher-priority RT thread on the runqueue always preempts a
 * lower-priority running RT thread; when clear, the running thread may finish
 * its computation if the queued thread can still make its deadline
 * (see thread_select).
 */
int sched_rt_runq_strict_priority = false;
2122
2123 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2124 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2125 {
2126 if (current_pset != new_pset) {
2127 pset_unlock(current_pset);
2128 pset_lock(new_pset);
2129 }
2130
2131 return new_pset;
2132 }
2133
2134 /*
2135 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2136 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2137 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2138 * IPI thrash if this core does not remain idle following the load balancing ASTs
2139 * Idle "thrash", when IPI issue is followed by idle entry/core power down
2140 * followed by a wakeup shortly thereafter.
2141 */
2142
#if (DEVELOPMENT || DEBUG)
/* Debug tunable: set to 0 to disable SMT rebalancing IPIs in sched_SMT_balance() */
int sched_smt_balance = 1;
#endif
2146
/* Invoked with pset locked, returns with pset unlocked */
/*
 * sched_SMT_balance:
 *
 * Called when cprocessor and its SMT sibling are both about to be idle.
 * Scans the pset for a physical core whose primary and secondary are both
 * busy running a non-RT thread, and signals that secondary so its load can
 * be rebalanced onto this idle core.  Always returns false.
 */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	/* Debug escape hatch: sched_smt_balance == 0 disables rebalancing */
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		/* Nothing to balance on non-SMT hardware */
		goto smt_balance_exit;
	}

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Candidate set: CPUs running that are not primaries, i.e. busy secondaries */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Rebalance only if the sibling primary is also busy and the thread is not RT */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* Issue the IPI after dropping the pset lock */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2200
2201 static cpumap_t
pset_available_cpumap(processor_set_t pset)2202 pset_available_cpumap(processor_set_t pset)
2203 {
2204 return pset->cpu_available_map & pset->recommended_bitmask;
2205 }
2206
2207 int
pset_available_cpu_count(processor_set_t pset)2208 pset_available_cpu_count(processor_set_t pset)
2209 {
2210 return bit_count(pset_available_cpumap(pset));
2211 }
2212
2213 bool
pset_is_recommended(processor_set_t pset)2214 pset_is_recommended(processor_set_t pset)
2215 {
2216 if (!pset) {
2217 return false;
2218 }
2219 return pset_available_cpu_count(pset) > 0;
2220 }
2221
2222 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2223 pset_available_but_not_running_cpumap(processor_set_t pset)
2224 {
2225 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2226 pset->recommended_bitmask;
2227 }
2228
2229 bool
pset_has_stealable_threads(processor_set_t pset)2230 pset_has_stealable_threads(processor_set_t pset)
2231 {
2232 pset_assert_locked(pset);
2233
2234 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2235 /*
2236 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2237 * available primary CPUs
2238 */
2239 avail_map &= pset->primary_map;
2240
2241 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2242 }
2243
2244 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2245 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2246 {
2247 cpumap_t avail_map = pset_available_cpumap(pset);
2248 if (!sched_allow_rt_smt) {
2249 /*
2250 * Secondary CPUs are not allowed to run RT threads, so
2251 * only primary CPUs should be included
2252 */
2253 avail_map &= pset->primary_map;
2254 }
2255
2256 return avail_map & ~pset->realtime_map;
2257 }
2258
2259 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2260 pset_needs_a_followup_IPI(processor_set_t pset)
2261 {
2262 int nbackup_cpus = 0;
2263
2264 if (rt_runq_is_low_latency(pset)) {
2265 nbackup_cpus = sched_rt_n_backup_processors;
2266 }
2267
2268 int rt_rq_count = rt_runq_count(pset);
2269
2270 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2271 }
2272
2273 bool
pset_has_stealable_rt_threads(processor_set_t pset)2274 pset_has_stealable_rt_threads(processor_set_t pset)
2275 {
2276 pset_node_t node = pset->node;
2277 if (bit_count(node->pset_map) == 1) {
2278 return false;
2279 }
2280
2281 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2282
2283 return rt_runq_count(pset) > bit_count(avail_map);
2284 }
2285
2286 static void
pset_update_rt_stealable_state(processor_set_t pset)2287 pset_update_rt_stealable_state(processor_set_t pset)
2288 {
2289 if (pset_has_stealable_rt_threads(pset)) {
2290 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2291 } else {
2292 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2293 }
2294 }
2295
/*
 * clear_pending_AST_bits:
 *
 * Acknowledge and clear all pending AST/IPI state targeted at 'processor'.
 * Must be called with the pset lock held.  trace_point_number distinguishes
 * the call sites in the kdebug trace.
 */
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);
	/* Trace the END event only when the urgent bit was actually set */
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2311
2312 /*
2313 * Called with pset locked, on a processor that is committing to run a new thread
2314 * Will transition an idle or dispatching processor to running as it picks up
2315 * the first new thread from the idle thread.
2316 */
2317 static void
pset_commit_processor_to_new_thread(processor_set_t pset,processor_t processor,thread_t new_thread)2318 pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2319 {
2320 pset_assert_locked(pset);
2321
2322 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2323 assert(current_thread() == processor->idle_thread);
2324
2325 /*
2326 * Dispatching processor is now committed to running new_thread,
2327 * so change its state to PROCESSOR_RUNNING.
2328 */
2329 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2330 } else {
2331 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2332 }
2333
2334 processor_state_update_from_thread(processor, new_thread, true);
2335
2336 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2337 bit_set(pset->realtime_map, processor->cpu_id);
2338 } else {
2339 bit_clear(pset->realtime_map, processor->cpu_id);
2340 }
2341 pset_update_rt_stealable_state(pset);
2342
2343 pset_node_t node = pset->node;
2344
2345 if (bit_count(node->pset_map) == 1) {
2346 /* Node has only a single pset, so skip node pset map updates */
2347 return;
2348 }
2349
2350 cpumap_t avail_map = pset_available_cpumap(pset);
2351
2352 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2353 if ((avail_map & pset->realtime_map) == avail_map) {
2354 /* No more non-RT CPUs in this pset */
2355 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2356 }
2357 avail_map &= pset->primary_map;
2358 if ((avail_map & pset->realtime_map) == avail_map) {
2359 /* No more non-RT primary CPUs in this pset */
2360 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2361 }
2362 } else {
2363 if ((avail_map & pset->realtime_map) != avail_map) {
2364 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2365 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2366 }
2367 }
2368 avail_map &= pset->primary_map;
2369 if ((avail_map & pset->realtime_map) != avail_map) {
2370 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2371 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2372 }
2373 }
2374 }
2375 }
2376
2377 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2378 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2379 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2380 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2381 #if defined(__x86_64__)
2382 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2383 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2384 #endif
2385 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2386 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2387
2388 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2389 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2390 {
2391 pset_map_t pset_map = stealing_pset->node->pset_map;
2392
2393 bit_clear(pset_map, stealing_pset->pset_id);
2394
2395 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2396 processor_set_t nset = pset_array[pset_id];
2397
2398 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2399 return true;
2400 }
2401 }
2402
2403 return false;
2404 }
2405
2406 /*
2407 * starting_pset must be locked, but returns true if it is unlocked before return
2408 */
2409 static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset,processor_t chosen_processor,bool spill_ipi,processor_t * result_processor,sched_ipi_type_t * result_ipi_type)2410 choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2411 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2412 {
2413 bool starting_pset_is_unlocked = false;
2414 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2415 int max_pri = rt_runq_priority(starting_pset);
2416 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2417 processor_set_t pset = starting_pset;
2418 processor_t next_rt_processor = PROCESSOR_NULL;
2419 if (spill_ipi) {
2420 processor_set_t nset = next_pset(pset);
2421 assert(nset != starting_pset);
2422 pset = change_locked_pset(pset, nset);
2423 starting_pset_is_unlocked = true;
2424 }
2425 do {
2426 const bool consider_secondaries = true;
2427 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2428 if (next_rt_processor == PROCESSOR_NULL) {
2429 if (!spill_ipi) {
2430 break;
2431 }
2432 processor_set_t nset = next_pset(pset);
2433 if (nset == starting_pset) {
2434 break;
2435 }
2436 pset = change_locked_pset(pset, nset);
2437 starting_pset_is_unlocked = true;
2438 }
2439 } while (next_rt_processor == PROCESSOR_NULL);
2440 if (next_rt_processor) {
2441 if (pset != starting_pset) {
2442 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2443 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2444 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2445 }
2446 }
2447 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2448 *result_processor = next_rt_processor;
2449 }
2450 if (pset != starting_pset) {
2451 pset_unlock(pset);
2452 }
2453
2454 return starting_pset_is_unlocked;
2455 }
2456
2457 /*
2458 * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2459 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2460 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2461 */
2462 typedef enum {
2463 none,
2464 backup,
2465 followup,
2466 spill
2467 } next_processor_type_t;
2468
2469 #undef LOOP_COUNT
2470 #ifdef LOOP_COUNT
2471 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2472 #endif
2473
2474 /*
2475 * thread_select:
2476 *
2477 * Select a new thread for the current processor to execute.
2478 *
2479 * May select the current thread, which must be locked.
2480 */
2481 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2482 thread_select(thread_t thread,
2483 processor_t processor,
2484 ast_t *reason)
2485 {
2486 processor_set_t pset = processor->processor_set;
2487 thread_t new_thread = THREAD_NULL;
2488
2489 assert(processor == current_processor());
2490 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2491
2492 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2493 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2494
2495 __kdebug_only int idle_reason = 0;
2496 __kdebug_only int delay_count = 0;
2497
2498 #if defined(__x86_64__)
2499 int timeout_count = sched_backup_cpu_timeout_count;
2500 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2501 /* Prefer cpu0 as backup */
2502 timeout_count--;
2503 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2504 /* Prefer secondary cpu as backup */
2505 timeout_count--;
2506 }
2507 #endif
2508 bool pending_AST_URGENT = false;
2509 bool pending_AST_PREEMPT = false;
2510
2511 #ifdef LOOP_COUNT
2512 int loop_count = -1;
2513 #endif
2514
2515 do {
2516 /*
2517 * Update the priority.
2518 */
2519 if (SCHED(can_update_priority)(thread)) {
2520 SCHED(update_priority)(thread);
2521 }
2522
2523 pset_lock(pset);
2524
2525 restart:
2526 #ifdef LOOP_COUNT
2527 loop_count++;
2528 if (loop_count > max_loop_count[processor->cpu_id]) {
2529 max_loop_count[processor->cpu_id] = loop_count;
2530 if (bit_count(loop_count) == 1) {
2531 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2532 }
2533 }
2534 #endif
2535 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2536 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2537
2538 processor_state_update_from_thread(processor, thread, true);
2539
2540 idle_reason = 0;
2541
2542 processor_t ast_processor = PROCESSOR_NULL;
2543 processor_t next_rt_processor = PROCESSOR_NULL;
2544 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2545 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2546
2547 assert(processor->state != PROCESSOR_OFF_LINE);
2548
2549 /*
2550 * Bound threads are dispatched to a processor without going through
2551 * choose_processor(), so in those cases we must continue trying to dequeue work
2552 * as we are the only option.
2553 */
2554 if (!SCHED(processor_bound_count)(processor)) {
2555 if (!processor->is_recommended) {
2556 /*
2557 * The performance controller has provided a hint to not dispatch more threads,
2558 */
2559 idle_reason = 1;
2560 goto send_followup_ipi_before_idle;
2561 } else if (rt_runq_count(pset)) {
2562 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2563 /* Give the current RT thread a chance to complete */
2564 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2565 #if defined(__x86_64__)
2566 /*
2567 * On Intel we want to avoid SMT secondary processors and processor 0
2568 * but allow them to be used as backup processors in case the preferred chosen
2569 * processor is delayed by interrupts or processor stalls. So if it is
2570 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2571 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2572 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2573 * to grab the thread before the (current) backup processor does.
2574 *
2575 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2576 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2577 * cpu0 before secondary cpus or not.
2578 */
2579 if (!ok_to_run_realtime_thread) {
2580 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2581 if (timeout_count-- > 0) {
2582 pset_unlock(pset);
2583 thread_unlock(thread);
2584 delay(10);
2585 delay_count++;
2586 thread_lock(thread);
2587 pset_lock(pset);
2588 goto restart;
2589 }
2590 ok_to_run_realtime_thread = true;
2591 }
2592 }
2593 #endif
2594 if (!ok_to_run_realtime_thread) {
2595 idle_reason = 2;
2596 goto send_followup_ipi_before_idle;
2597 }
2598 } else if (processor->processor_primary != processor) {
2599 /*
2600 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2601 * we should look for work only under the same conditions that choose_processor()
2602 * would have assigned work, which is when all primary processors have been assigned work.
2603 */
2604 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2605 /* There are idle primaries */
2606 idle_reason = 3;
2607 goto idle;
2608 }
2609 }
2610 }
2611
2612 /*
2613 * Test to see if the current thread should continue
2614 * to run on this processor. Must not be attempting to wait, and not
2615 * bound to a different processor, nor be in the wrong
2616 * processor set, nor be forced to context switch by TH_SUSP.
2617 *
2618 * Note that there are never any RT threads in the regular runqueue.
2619 *
2620 * This code is very insanely tricky.
2621 */
2622
2623 /* i.e. not waiting, not TH_SUSP'ed */
2624 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2625
2626 /*
2627 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2628 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2629 * <rdar://problem/47907700>
2630 *
2631 * A yielding thread shouldn't be forced to context switch.
2632 */
2633
2634 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2635
2636 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2637
2638 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2639
2640 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2641
2642 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2643
2644 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2645
2646 bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2647 if (current_thread_can_keep_running) {
2648 /*
2649 * This thread is eligible to keep running on this processor.
2650 *
2651 * RT threads with un-expired quantum stay on processor,
2652 * unless there's a valid RT thread with an earlier deadline
2653 * and it is still ok_to_run_realtime_thread.
2654 */
2655 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2656 /*
2657 * Pick a new RT thread only if ok_to_run_realtime_thread
2658 * (but the current thread is allowed to complete).
2659 */
2660 if (ok_to_run_realtime_thread) {
2661 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2662 goto pick_new_rt_thread;
2663 }
2664 if (rt_runq_priority(pset) > thread->sched_pri) {
2665 if (sched_rt_runq_strict_priority) {
2666 /* The next RT thread is better, so pick it off the runqueue. */
2667 goto pick_new_rt_thread;
2668 }
2669
2670 /*
2671 * See if the current lower priority thread can continue to run without causing
2672 * the higher priority thread on the runq queue to miss its deadline.
2673 */
2674 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2675 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2676 /* The next RT thread is better, so pick it off the runqueue. */
2677 goto pick_new_rt_thread;
2678 }
2679 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2680 /* The next RT thread is better, so pick it off the runqueue. */
2681 goto pick_new_rt_thread;
2682 }
2683 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2684 goto pick_new_rt_thread;
2685 }
2686 }
2687
2688 /* This is still the best RT thread to run. */
2689 processor->deadline = thread->realtime.deadline;
2690
2691 sched_update_pset_load_average(pset, 0);
2692
2693 clear_pending_AST_bits(pset, processor, 1);
2694
2695 next_rt_processor = PROCESSOR_NULL;
2696 next_rt_ipi_type = SCHED_IPI_NONE;
2697
2698 bool pset_unlocked = false;
2699 __kdebug_only next_processor_type_t nptype = none;
2700 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2701 nptype = spill;
2702 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2703 } else if (pset_needs_a_followup_IPI(pset)) {
2704 nptype = followup;
2705 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2706 }
2707 if (!pset_unlocked) {
2708 pset_unlock(pset);
2709 }
2710
2711 if (next_rt_processor) {
2712 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2713 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2714 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2715 }
2716
2717 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2718 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2719 return thread;
2720 }
2721
2722 if ((rt_runq_count(pset) == 0) &&
2723 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2724 /* This thread is still the highest priority runnable (non-idle) thread */
2725 processor->deadline = RT_DEADLINE_NONE;
2726
2727 sched_update_pset_load_average(pset, 0);
2728
2729 clear_pending_AST_bits(pset, processor, 2);
2730
2731 pset_unlock(pset);
2732
2733 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2734 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2735 return thread;
2736 }
2737 } else {
2738 /*
2739 * This processor must context switch.
2740 * If it's due to a rebalance, we should aggressively find this thread a new home.
2741 */
2742 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2743 *reason |= AST_REBALANCE;
2744 }
2745 }
2746
2747 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2748 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2749 (processor->processor_secondary->state == PROCESSOR_IDLE));
2750
2751 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2752 if (ok_to_run_realtime_thread) {
2753 pick_new_rt_thread:
2754 new_thread = sched_rt_choose_thread(pset);
2755 if (new_thread != THREAD_NULL) {
2756 processor->deadline = new_thread->realtime.deadline;
2757 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2758
2759 clear_pending_AST_bits(pset, processor, 3);
2760
2761 if (processor->processor_secondary != NULL) {
2762 processor_t sprocessor = processor->processor_secondary;
2763 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2764 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2765 ast_processor = sprocessor;
2766 }
2767 }
2768 }
2769 }
2770
2771 send_followup_ipi_before_idle:
2772 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2773 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2774 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2775 }
2776 __kdebug_only next_processor_type_t nptype = none;
2777 bool pset_unlocked = false;
2778 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2779 nptype = spill;
2780 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2781 } else if (pset_needs_a_followup_IPI(pset)) {
2782 nptype = followup;
2783 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2784 }
2785
2786 assert(new_thread || !ast_processor);
2787 if (new_thread || next_rt_processor) {
2788 if (!pset_unlocked) {
2789 pset_unlock(pset);
2790 pset_unlocked = true;
2791 }
2792 if (ast_processor == next_rt_processor) {
2793 ast_processor = PROCESSOR_NULL;
2794 ipi_type = SCHED_IPI_NONE;
2795 }
2796
2797 if (ast_processor) {
2798 sched_ipi_perform(ast_processor, ipi_type);
2799 }
2800
2801 if (next_rt_processor) {
2802 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2803 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2804 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2805 }
2806
2807 if (new_thread) {
2808 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2809 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2810 return new_thread;
2811 }
2812 }
2813
2814 if (pset_unlocked) {
2815 pset_lock(pset);
2816 }
2817
2818 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2819 /* Things changed while we dropped the lock */
2820 goto restart;
2821 }
2822
2823 if (processor->is_recommended) {
2824 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2825 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2826 /* Things changed while we dropped the lock */
2827 goto restart;
2828 }
2829
2830 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2831 /* secondary can only run realtime thread */
2832 if (idle_reason == 0) {
2833 idle_reason = 4;
2834 }
2835 goto idle;
2836 }
2837 } else if (!SCHED(processor_bound_count)(processor)) {
2838 /* processor not recommended and no bound threads */
2839 if (idle_reason == 0) {
2840 idle_reason = 5;
2841 }
2842 goto idle;
2843 }
2844
2845 processor->deadline = RT_DEADLINE_NONE;
2846
2847 /* No RT threads, so let's look at the regular threads. */
2848 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2849 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2850
2851 clear_pending_AST_bits(pset, processor, 4);
2852
2853 ast_processor = PROCESSOR_NULL;
2854 ipi_type = SCHED_IPI_NONE;
2855
2856 processor_t sprocessor = processor->processor_secondary;
2857 if (sprocessor != NULL) {
2858 if (sprocessor->state == PROCESSOR_RUNNING) {
2859 if (thread_no_smt(new_thread)) {
2860 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2861 ast_processor = sprocessor;
2862 }
2863 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2864 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2865 ast_processor = sprocessor;
2866 }
2867 }
2868 pset_unlock(pset);
2869
2870 if (ast_processor) {
2871 sched_ipi_perform(ast_processor, ipi_type);
2872 }
2873 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2874 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2875 return new_thread;
2876 }
2877
2878 if (processor->must_idle) {
2879 processor->must_idle = false;
2880 *reason |= AST_REBALANCE;
2881 idle_reason = 6;
2882 goto idle;
2883 }
2884
2885 if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2886 /*
2887 * No runnable threads, attempt to steal
2888 * from other processors. Returns with pset lock dropped.
2889 */
2890
2891 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2892 pset_lock(pset);
2893 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2894 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2895 /*
2896 * A realtime thread choose this processor while it was DISPATCHING
2897 * and the pset lock was dropped
2898 */
2899 ast_on(AST_URGENT | AST_PREEMPT);
2900 }
2901
2902 clear_pending_AST_bits(pset, processor, 5);
2903
2904 pset_unlock(pset);
2905
2906 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2907 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2908 return new_thread;
2909 }
2910
2911 /*
2912 * If other threads have appeared, shortcut
2913 * around again.
2914 */
2915 if (SCHED(processor_bound_count)(processor)) {
2916 continue;
2917 }
2918 if (processor->is_recommended) {
2919 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2920 continue;
2921 }
2922 }
2923
2924 pset_lock(pset);
2925 }
2926
2927 idle:
2928 /* Someone selected this processor while we had dropped the lock */
2929 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2930 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2931 goto restart;
2932 }
2933
2934 if ((idle_reason == 0) && current_thread_can_keep_running) {
2935 /* This thread is the only runnable (non-idle) thread */
2936 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2937 processor->deadline = thread->realtime.deadline;
2938 } else {
2939 processor->deadline = RT_DEADLINE_NONE;
2940 }
2941
2942 sched_update_pset_load_average(pset, 0);
2943
2944 clear_pending_AST_bits(pset, processor, 6);
2945
2946 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2947 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2948 pset_unlock(pset);
2949 return thread;
2950 }
2951
2952 /*
2953 * Nothing is runnable, or this processor must be forced idle,
2954 * so set this processor idle if it was running.
2955 */
2956 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2957 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2958 processor_state_update_idle(processor);
2959 }
2960 pset_update_rt_stealable_state(pset);
2961
2962 clear_pending_AST_bits(pset, processor, 7);
2963
2964 /* Invoked with pset locked, returns with pset unlocked */
2965 processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2966
2967 new_thread = processor->idle_thread;
2968 } while (new_thread == THREAD_NULL);
2969
2970 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2971 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2972 return new_thread;
2973 }
2974
2975 /*
2976 * thread_invoke
2977 *
2978 * Called at splsched with neither thread locked.
2979 *
2980 * Perform a context switch and start executing the new thread.
2981 *
2982 * Returns FALSE when the context switch didn't happen.
2983 * The reference to the new thread is still consumed.
2984 *
2985 * "self" is what is currently running on the processor,
2986 * "thread" is the new thread to context switch to
2987 * (which may be the same thread in some cases)
2988 */
static boolean_t
thread_invoke(
	thread_t                        self,
	thread_t                        thread,
	ast_t                           reason)
{
	/*
	 * Blocking with preemption disabled (or from interrupt context) is
	 * fatal; a negative level indicates an unbalanced unlock somewhere.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Continuation (if any) that "self" parked in before blocking. */
	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	/* Single timestamp used for all switch-time accounting below. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip maintenance when switching to the idle thread, or on a
	 * realtime handoff (keeps the handoff path short).
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime, true);
	}
#endif

	recount_log_switch_thread(&snap);

	assert_thread_magic(self);
	assert(self == current_thread());
	thread_assert_runq_null(self);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	/* Locked until the per-path thread_unlock() below. */
	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	thread_assert_runq_null(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	/*
	 * Three switch strategies follow:
	 *  1) "self" parked in a continuation and "thread" is stackless:
	 *     hand "self"'s stack directly to "thread" (no register save).
	 *  2) "self" parked in a continuation and thread == self:
	 *     resume in place via call_continuation().
	 *  3) Otherwise: full register/context save via
	 *     machine_switch_context().
	 */
	if (continuation != NULL) {
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for stats. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required. However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/*
			 * No stack available: queue "thread" for the stack
			 * allocator and report that no switch happened.
			 */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with a stack: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required. We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* Previous snap on the old stack is gone. */
	recount_log_switch_thread_on(NULL);

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3315
3316 #if defined(CONFIG_SCHED_DEFERRED_AST)
3317 /*
3318 * pset_cancel_deferred_dispatch:
3319 *
3320 * Cancels all ASTs that we can cancel for the given processor set
3321 * if the current processor is running the last runnable thread in the
3322 * system.
3323 *
3324 * This function assumes the current thread is runnable. This must
3325 * be called with the pset unlocked.
3326 */
3327 static void
pset_cancel_deferred_dispatch(processor_set_t pset,processor_t processor)3328 pset_cancel_deferred_dispatch(
3329 processor_set_t pset,
3330 processor_t processor)
3331 {
3332 processor_t active_processor = NULL;
3333 uint32_t sampled_sched_run_count;
3334
3335 pset_lock(pset);
3336 sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3337
3338 /*
3339 * If we have emptied the run queue, and our current thread is runnable, we
3340 * should tell any processors that are still DISPATCHING that they will
3341 * probably not have any work to do. In the event that there are no
3342 * pending signals that we can cancel, this is also uninteresting.
3343 *
3344 * In the unlikely event that another thread becomes runnable while we are
3345 * doing this (sched_run_count is atomically updated, not guarded), the
3346 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3347 * in order to dispatch it to a processor in our pset. So, the other
3348 * codepath will wait while we squash all cancelable ASTs, get the pset
3349 * lock, and then dispatch the freshly runnable thread. So this should be
3350 * correct (we won't accidentally have a runnable thread that hasn't been
3351 * dispatched to an idle processor), if not ideal (we may be restarting the
3352 * dispatch process, which could have some overhead).
3353 */
3354
3355 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3356 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3357 pset->pending_deferred_AST_cpu_mask &
3358 ~pset->pending_AST_URGENT_cpu_mask);
3359 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3360 active_processor = processor_array[cpuid];
3361 /*
3362 * If a processor is DISPATCHING, it could be because of
3363 * a cancelable signal.
3364 *
3365 * IF the processor is not our
3366 * current processor (the current processor should not
3367 * be DISPATCHING, so this is a bit paranoid), AND there
3368 * is a cancelable signal pending on the processor, AND
3369 * there is no non-cancelable signal pending (as there is
3370 * no point trying to backtrack on bringing the processor
3371 * up if a signal we cannot cancel is outstanding), THEN
3372 * it should make sense to roll back the processor state
3373 * to the IDLE state.
3374 *
3375 * If the racey nature of this approach (as the signal
3376 * will be arbitrated by hardware, and can fire as we
3377 * roll back state) results in the core responding
3378 * despite being pushed back to the IDLE state, it
3379 * should be no different than if the core took some
3380 * interrupt while IDLE.
3381 */
3382 if (active_processor != processor) {
3383 /*
3384 * Squash all of the processor state back to some
3385 * reasonable facsimile of PROCESSOR_IDLE.
3386 */
3387
3388 processor_state_update_idle(active_processor);
3389 active_processor->deadline = RT_DEADLINE_NONE;
3390 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3391 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3392 machine_signal_idle_cancel(active_processor);
3393 }
3394 }
3395 }
3396
3397 pset_unlock(pset);
3398 }
3399 #else
3400 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3401 #endif
3402
3403 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3404 thread_csw_callout(
3405 thread_t old,
3406 thread_t new,
3407 uint64_t timestamp)
3408 {
3409 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3410 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3411 machine_switch_perfcontrol_context(event, timestamp, 0,
3412 same_pri_latency, old, new);
3413 }
3414
3415
3416 /*
3417 * thread_dispatch:
3418 *
3419 * Handle threads at context switch. Re-dispatch other thread
3420 * if still running, otherwise update run state and perform
3421 * special actions. Update quantum for other thread and begin
3422 * the quantum for ourselves.
3423 *
3424 * "thread" is the old thread that we have switched away from.
3425 * "self" is the new current thread that we have context switched to
3426 *
3427 * Called at splsched.
3428 *
3429 */
void
thread_dispatch(
	thread_t                thread,
	thread_t                self)
{
	processor_t processor = self->last_processor;
	/* Set when the outgoing thread was the idle thread; drives kperf timer setup below. */
	bool was_idle = false;

	assert(processor == current_processor());
	assert(self == current_thread_volatile());
	assert(thread != self);

	if (thread != THREAD_NULL) {
		/*
		 * Do the perfcontrol callout for context switch.
		 * The reason we do this here is:
		 * - thread_dispatch() is called from various places that are not
		 *   the direct context switch path for eg. processor shutdown etc.
		 *   So adding the callout here covers all those cases.
		 * - We want this callout as early as possible to be close
		 *   to the timestamp taken in thread_invoke()
		 * - We want to avoid holding the thread lock while doing the
		 *   callout
		 * - We do not want to callout if "thread" is NULL.
		 */
		thread_csw_callout(thread, self, processor->last_dispatch);

#if KASAN
		if (thread->continuation != NULL) {
			/*
			 * Thread has a continuation and the normal stack is going away.
			 * Unpoison the stack and mark all fakestack objects as unused.
			 */
#if KASAN_CLASSIC
			kasan_fakestack_drop(thread);
#endif /* KASAN_CLASSIC */
			if (thread->kernel_stack) {
				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
			}
		}


#if KASAN_CLASSIC
		/*
		 * Free all unused fakestack objects.
		 */
		kasan_fakestack_gc(thread);
#endif /* KASAN_CLASSIC */
#endif /* KASAN */

		/*
		 * If blocked at a continuation, discard
		 * the stack.
		 */
		if (thread->continuation != NULL && thread->kernel_stack != 0) {
			stack_free(thread);
		}

		if (thread->state & TH_IDLE) {
			/* Idle thread: no quantum/ledger accounting to do. */
			was_idle = true;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
			    (uintptr_t)thread_tid(thread), 0, thread->state,
			    sched_run_buckets[TH_BUCKET_RUN], 0);
		} else {
			int64_t consumed;
			int64_t remainder = 0;

			/* Unused portion of the old thread's quantum, if it didn't run it out. */
			if (processor->quantum_end > processor->last_dispatch) {
				remainder = processor->quantum_end -
				    processor->last_dispatch;
			}

			/* CPU time actually used this time on core. */
			consumed = thread->quantum_remaining - remainder;

			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the task and
				 * the individual thread.
				 */
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit_thread(thread, thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
				if (thread->t_bankledger) {
					ledger_credit_thread(thread, thread->t_bankledger,
					    bank_ledgers.cpu_time,
					    (consumed - thread->t_deduct_bank_ledger_time));
				}
				thread->t_deduct_bank_ledger_time = 0;
				if (consumed > 0) {
					/*
					 * This should never be negative, but in traces we are seeing some instances
					 * of consumed being negative.
					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
					 */
					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
				}
			}

			/* For the thread that we just context switched away from, figure
			 * out if we have expired the wq quantum and set the AST if we have
			 */
			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
				thread_evaluate_workqueue_quantum_expiry(thread);
			}

			if (__improbable(thread->rwlock_count != 0)) {
				smr_mark_active_trackers_stalled(thread);
			}

			/*
			 * Pairs with task_restartable_ranges_synchronize
			 */
			wake_lock(thread);
			thread_lock(thread);

			/*
			 * Same as ast_check(), in case we missed the IPI
			 */
			thread_reset_pcs_ack_IPI(thread);

			/*
			 * Apply a priority floor if the thread holds a kernel resource
			 * or explicitly requested it.
			 * Do this before checking starting_pri to avoid overpenalizing
			 * repeated rwlock blockers.
			 */
			if (__improbable(thread->rwlock_count != 0)) {
				lck_rw_set_promotion_locked(thread);
			}
			if (__improbable(thread->priority_floor_count != 0)) {
				thread_floor_boost_set_promotion_locked(thread);
			}

			boolean_t keep_quantum = processor->first_timeslice;

			/*
			 * Treat a thread which has dropped priority since it got on core
			 * as having expired its quantum.
			 */
			if (processor->starting_pri > thread->sched_pri) {
				keep_quantum = FALSE;
			}

			/* Compute remainder of current quantum. */
			if (keep_quantum &&
			    processor->quantum_end > processor->last_dispatch) {
				thread->quantum_remaining = (uint32_t)remainder;
			} else {
				thread->quantum_remaining = 0;
			}

			if (thread->sched_mode == TH_MODE_REALTIME) {
				/*
				 * Cancel the deadline if the thread has
				 * consumed the entire quantum.
				 */
				if (thread->quantum_remaining == 0) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
				}
			} else {
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
				/*
				 * For non-realtime threads treat a tiny
				 * remaining quantum as an expired quantum
				 * but include what's left next time.
				 */
				if (thread->quantum_remaining < min_std_quantum) {
					thread->reason |= AST_QUANTUM;
					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
				}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
			}

			/*
			 * If we are doing a direct handoff then
			 * take the remainder of the quantum.
			 */
			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
				self->quantum_remaining = thread->quantum_remaining;
				thread->reason |= AST_QUANTUM;
				thread->quantum_remaining = 0;
			} else {
#if defined(CONFIG_SCHED_MULTIQ)
				if (SCHED(sched_groups_enabled) &&
				    thread->sched_group == self->sched_group) {
					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
					    self->reason, (uintptr_t)thread_tid(thread),
					    self->quantum_remaining, thread->quantum_remaining, 0);

					self->quantum_remaining = thread->quantum_remaining;
					thread->quantum_remaining = 0;
					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
				}
#endif /* defined(CONFIG_SCHED_MULTIQ) */
			}

			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);

			if (!(thread->state & TH_WAIT)) {
				/*
				 * Still runnable.
				 */
				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;

				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);

				ast_t reason = thread->reason;
				sched_options_t options = SCHED_NONE;

				if (reason & AST_REBALANCE) {
					options |= SCHED_REBALANCE;
					if (reason & AST_QUANTUM) {
						/*
						 * Having gone to the trouble of forcing this thread off a less preferred core,
						 * we should force the preferable core to reschedule immediately to give this
						 * thread a chance to run instead of just sitting on the run queue where
						 * it may just be stolen back by the idle core we just forced it off.
						 * But only do this at the end of a quantum to prevent cascading effects.
						 */
						options |= SCHED_PREEMPT;
					}
				}

				/* Map the off-core reason to run-queue placement. */
				if (reason & AST_QUANTUM) {
					options |= SCHED_TAILQ;
				} else if (reason & AST_PREEMPT) {
					options |= SCHED_HEADQ;
				} else {
					options |= (SCHED_PREEMPT | SCHED_TAILQ);
				}

				thread_setrun(thread, options);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
				    sched_run_buckets[TH_BUCKET_RUN], 0);

				/* Wake anyone blocked in thread_wait() on this thread. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);
			} else {
				/*
				 * Waiting.
				 */
				boolean_t should_terminate = FALSE;
				uint32_t new_run_count;
				int thread_state = thread->state;

				/* Only the first call to thread_dispatch
				 * after explicit termination should add
				 * the thread to the termination queue
				 */
				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
					should_terminate = TRUE;
					thread_state |= TH_TERMINATE2;
				}

				timer_stop(&thread->runnable_timer, processor->last_dispatch);

				/* Thread is now off the run state; publish the new state atomically. */
				thread_state &= ~TH_RUN;
				thread->state = thread_state;

				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
				thread->chosen_processor = PROCESSOR_NULL;

				new_run_count = SCHED(run_count_decr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
					work_interval_auto_join_unwind(thread);
				}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_SFI
				if (thread->reason & AST_SFI) {
					thread->wait_sfi_begin_time = processor->last_dispatch;
				}
#endif
				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
				    new_run_count, 0);

				if (thread_state & TH_WAIT_REPORT) {
					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
				}

				/* Wake anyone blocked in thread_wait() on this thread. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);

				if (should_terminate) {
					thread_terminate_enqueue(thread);
				}
			}
		}
		/*
		 * The thread could have been added to the termination queue, so it's
		 * unsafe to use after this point.
		 */
		thread = THREAD_NULL;
	}

	int urgency = THREAD_URGENCY_NONE;
	uint64_t latency = 0;

	/* Update (new) current thread and reprogram running timers */
	thread_lock(self);

	if (!(self->state & TH_IDLE)) {
		uint64_t arg1, arg2;

#if CONFIG_SCHED_SFI
		ast_t new_ast;

		new_ast = sfi_thread_needs_ast(self, NULL);

		if (new_ast != AST_NONE) {
			ast_on(new_ast);
		}
#endif

		if (processor->last_dispatch < self->last_made_runnable_time) {
			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
			    processor->last_dispatch, self->last_made_runnable_time);
		}

		assert(self->last_made_runnable_time <= self->last_basepri_change_time);

		/* Scheduling latency: time between becoming runnable and getting on core. */
		latency = processor->last_dispatch - self->last_made_runnable_time;
		assert(latency >= self->same_pri_latency);

		urgency = thread_get_urgency(self, &arg1, &arg2);

		thread_tell_urgency(urgency, arg1, arg2, latency, self);

		/*
		 * Start a new CPU limit interval if the previous one has
		 * expired. This should happen before initializing a new
		 * quantum.
		 */
		if (cpulimit_affects_quantum &&
		    thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
			thread_cpulimit_restart(processor->last_dispatch);
		}

		/*
		 * Get a new quantum if none remaining.
		 */
		if (self->quantum_remaining == 0) {
			thread_quantum_init(self, processor->last_dispatch);
		}

		/*
		 * Set up quantum timer and timeslice.
		 */
		processor->quantum_end = processor->last_dispatch +
		    self->quantum_remaining;

		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
		    processor->quantum_end, processor->last_dispatch);
		if (was_idle) {
			/*
			 * kperf's running timer is active whenever the idle thread for a
			 * CPU is not running.
			 */
			kperf_running_setup(processor, processor->last_dispatch);
		}
		running_timers_activate(processor);
		processor->first_timeslice = TRUE;
	} else {
		/* Going idle: no quantum timer, no urgency. */
		running_timers_deactivate(processor);
		processor->first_timeslice = FALSE;
		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	}

	assert(self->block_hint == kThreadWaitNone);
	self->computation_epoch = processor->last_dispatch;
	/*
	 * This relies on the interrupt time being tallied up to the thread in the
	 * exception handler epilogue, which is before AST context where preemption
	 * is considered (and the scheduler is potentially invoked to
	 * context switch, here).
	 */
	self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
	self->reason = AST_NONE;
	processor->starting_pri = self->sched_pri;

	thread_unlock(self);

	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	    processor->last_dispatch);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	/*
	 * TODO: Can we state that redispatching our old thread is also
	 * uninteresting?
	 */
	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
		pset_cancel_deferred_dispatch(processor->processor_set, processor);
	}
#endif
}
3855
3856 /*
3857 * thread_block_reason:
3858 *
3859 * Forces a reschedule, blocking the caller if a wait
3860 * has been asserted.
3861 *
3862 * If a continuation is specified, then thread_invoke will
3863 * attempt to discard the thread's kernel stack. When the
3864 * thread resumes, it will execute the continuation function
3865 * on a new kernel stack.
3866 */
wait_result_t
thread_block_reason(
	thread_continue_t       continuation,
	void                    *parameter,
	ast_t                   reason)
{
	thread_t        self = current_thread();
	processor_t     processor;
	thread_t        new_thread;
	spl_t           s;

	/* Disable interrupts while manipulating scheduler state. */
	s = splsched();

	processor = current_processor();

	/* If we're explicitly yielding, force a subsequent quantum */
	if (reason & AST_YIELD) {
		processor->first_timeslice = FALSE;
	}

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	clear_pending_nonurgent_preemption(processor);

#if PROC_REF_DEBUG
	/*
	 * Blocking with a continuation discards the kernel stack, so no
	 * proc refs may be held across this point (debug builds only).
	 */
	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
	}
#endif

	/* Stash the continuation for thread_invoke()/thread_continue() to pick up. */
	self->continuation = continuation;
	self->parameter = parameter;

	/* Trace only when actually blocking (state bits beyond TH_RUN/TH_IDLE set). */
	if (self->state & ~(TH_RUN | TH_IDLE)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	}

	/*
	 * Select a new thread and attempt to switch to it.
	 * thread_invoke() may fail, in which case re-select and retry.
	 */
	do {
		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	} while (!thread_invoke(self, new_thread, reason));

	splx(s);

	/* Reason the wait ended (set by whoever woke us, or interrupted/etc.). */
	return self->wait_result;
}
3917
3918 /*
3919 * thread_block:
3920 *
3921 * Block the current thread if a wait has been asserted.
3922 */
3923 wait_result_t
thread_block(thread_continue_t continuation)3924 thread_block(
3925 thread_continue_t continuation)
3926 {
3927 return thread_block_reason(continuation, NULL, AST_NONE);
3928 }
3929
3930 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3931 thread_block_parameter(
3932 thread_continue_t continuation,
3933 void *parameter)
3934 {
3935 return thread_block_reason(continuation, parameter, AST_NONE);
3936 }
3937
3938 /*
3939 * thread_run:
3940 *
3941 * Switch directly from the current thread to the
3942 * new thread, handing off our quantum if appropriate.
3943 *
3944 * New thread must be runnable, and not on a run queue.
3945 *
3946 * Called at splsched.
3947 */
int
thread_run(
	thread_t                self,
	thread_continue_t       continuation,
	void                    *parameter,
	thread_t                new_thread)
{
	ast_t reason = AST_NONE;

	/* Hand off our remaining quantum unless we are the idle thread. */
	if ((self->state & TH_IDLE) == 0) {
		reason = AST_HANDOFF;
	}

	/*
	 * If this thread hadn't been setrun'ed, it
	 * might not have a chosen processor, so give it one
	 */
	if (new_thread->chosen_processor == NULL) {
		new_thread->chosen_processor = current_processor();
	}

	self->continuation = continuation;
	self->parameter = parameter;

	while (!thread_invoke(self, new_thread, reason)) {
		/* the handoff failed, so we have to fall back to the normal block path */
		processor_t processor = current_processor();

		/* No handoff on the fallback path; let thread_select() decide. */
		reason = AST_NONE;

		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	}

	/* Reason the (possible) wait ended. */
	return self->wait_result;
}
3985
3986 /*
3987 * thread_continue:
3988 *
3989 * Called at splsched when a thread first receives
3990 * a new stack after a continuation.
3991 *
3992 * Called with THREAD_NULL as the old thread when
3993 * invoked by machine_load_context.
3994 */
void
thread_continue(
	thread_t        thread)
{
	thread_t self = current_thread();
	thread_continue_t continuation;
	void *parameter;

	DTRACE_SCHED(on__cpu);

	/* Continuation/parameter were stashed by thread_block_reason()/thread_run(). */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif

	/* Finish dispatching the previous thread ("thread") off this CPU. */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if KASAN_TBI
	/* Fresh stack for the continuation: clear any stale KASAN poison. */
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN_TBI */


	/* Jump to the continuation on the new stack; does not return. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
4040
4041 void
thread_quantum_init(thread_t thread,uint64_t now)4042 thread_quantum_init(thread_t thread, uint64_t now)
4043 {
4044 uint64_t new_quantum = 0;
4045
4046 switch (thread->sched_mode) {
4047 case TH_MODE_REALTIME:
4048 new_quantum = thread->realtime.computation;
4049 new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4050 break;
4051
4052 case TH_MODE_FIXED:
4053 new_quantum = SCHED(initial_quantum_size)(thread);
4054 new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4055 break;
4056
4057 default:
4058 new_quantum = SCHED(initial_quantum_size)(thread);
4059 break;
4060 }
4061
4062 if (cpulimit_affects_quantum) {
4063 const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4064
4065 /*
4066 * If there's no remaining CPU time, the ledger system will
4067 * notice and put the thread to sleep.
4068 */
4069 if (cpulimit_remaining > 0) {
4070 new_quantum = MIN(new_quantum, cpulimit_remaining);
4071 }
4072 }
4073
4074 assert3u(new_quantum, <, UINT32_MAX);
4075 assert3u(new_quantum, >, 0);
4076
4077 thread->quantum_remaining = (uint32_t)new_quantum;
4078 }
4079
4080 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)4081 sched_timeshare_initial_quantum_size(thread_t thread)
4082 {
4083 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4084 return bg_quantum;
4085 } else {
4086 return std_quantum;
4087 }
4088 }
4089
4090 /*
4091 * run_queue_init:
4092 *
4093 * Initialize a run queue before first use.
4094 */
4095 void
run_queue_init(run_queue_t rq)4096 run_queue_init(
4097 run_queue_t rq)
4098 {
4099 rq->highq = NOPRI;
4100 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4101 rq->bitmap[i] = 0;
4102 }
4103 rq->urgency = rq->count = 0;
4104 for (int i = 0; i < NRQS; i++) {
4105 circle_queue_init(&rq->queues[i]);
4106 }
4107 }
4108
4109 /*
4110 * run_queue_dequeue:
4111 *
4112 * Perform a dequeue operation on a run queue,
4113 * and return the resulting thread.
4114 *
4115 * The run queue must be locked (see thread_run_queue_remove()
4116 * for more info), and not empty.
4117 */
thread_t
run_queue_dequeue(
	run_queue_t     rq,
	sched_options_t options)
{
	thread_t        thread;
	/* Always dequeue from the highest occupied priority level. */
	circle_queue_t  queue = &rq->queues[rq->highq];

	/* SCHED_HEADQ takes from the head of the level's queue, else the tail. */
	if (options & SCHED_HEADQ) {
		thread = cqe_dequeue_head(queue, struct thread, runq_links);
	} else {
		thread = cqe_dequeue_tail(queue, struct thread, runq_links);
	}

	assert(thread != THREAD_NULL);
	assert_thread_magic(thread);

	/* Thread is no longer on any run queue. */
	thread_clear_runq(thread);
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(rq->highq)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}
	/* If that level drained, recompute the highest occupied priority. */
	if (circle_queue_empty(queue)) {
		bitmap_clear(rq->bitmap, rq->highq);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
	}

	return thread;
}
4148
4149 /*
4150 * run_queue_enqueue:
4151 *
4152 * Perform a enqueue operation on a run queue.
4153 *
4154 * The run queue must be locked (see thread_run_queue_remove()
4155 * for more info).
4156 */
boolean_t
run_queue_enqueue(
	run_queue_t      rq,
	thread_t         thread,
	sched_options_t  options)
{
	circle_queue_t  queue = &rq->queues[thread->sched_pri];
	/* TRUE when this enqueue raised the queue's highest priority. */
	boolean_t       result = FALSE;

	assert_thread_magic(thread);

	if (circle_queue_empty(queue)) {
		/* First thread at this priority: mark the level occupied. */
		circle_enqueue_tail(queue, &thread->runq_links);

		rq_bitmap_set(rq->bitmap, thread->sched_pri);
		if (thread->sched_pri > rq->highq) {
			rq->highq = thread->sched_pri;
			result = TRUE;
		}
	} else {
		/* Level already occupied: SCHED_TAILQ appends, else push to front. */
		if (options & SCHED_TAILQ) {
			circle_enqueue_tail(queue, &thread->runq_links);
		} else {
			circle_enqueue_head(queue, &thread->runq_links);
		}
	}
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		rq->urgency++;
	}
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count++;

	return result;
}
4191
4192 /*
4193 * run_queue_remove:
4194 *
4195 * Remove a specific thread from a runqueue.
4196 *
4197 * The run queue must be locked.
4198 */
void
run_queue_remove(
	run_queue_t    rq,
	thread_t       thread)
{
	circle_queue_t  queue = &rq->queues[thread->sched_pri];

	/* Caller must only remove a thread that is actually on a run queue. */
	thread_assert_runq_nonnull(thread);
	assert_thread_magic(thread);

	circle_dequeue(queue, &thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}

	if (circle_queue_empty(queue)) {
		/* update run queue status */
		bitmap_clear(rq->bitmap, thread->sched_pri);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
	}

	/* Thread is no longer on any run queue. */
	thread_clear_runq(thread);
}
4224
4225 /*
4226 * run_queue_peek
4227 *
4228 * Peek at the runq and return the highest
4229 * priority thread from the runq.
4230 *
4231 * The run queue must be locked.
4232 */
4233 thread_t
run_queue_peek(run_queue_t rq)4234 run_queue_peek(
4235 run_queue_t rq)
4236 {
4237 if (rq->count > 0) {
4238 circle_queue_t queue = &rq->queues[rq->highq];
4239 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4240 assert_thread_magic(thread);
4241 return thread;
4242 } else {
4243 return THREAD_NULL;
4244 }
4245 }
4246
/*
 * Insert a realtime thread into the per-pset realtime run queue, keeping
 * each priority level sorted by deadline.  Returns true when the new
 * thread became the head of its level (i.e. preemption should be considered).
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Per-priority sub-queue index, 0-based from BASEPRI_RTQUEUES. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	/* Mark this priority level occupied. */
	bitmap_set(map, i);

	queue_t queue = &rt_runq->pri_queue;
	uint64_t deadline = thread->realtime.deadline;
	bool preempt = false;
	/* True when this thread has the earliest deadline at its level. */
	bool earliest = false;

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				if (iter == queue_first(queue)) {
					/* New head of the level: earliest deadline here. */
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				/* Latest deadline of all: append at the tail. */
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Publish a new queue-wide earliest deadline if this thread beat it. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	/* Record which processor's runq now holds the thread. */
	thread_set_runq_locked(thread, processor);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4307
/*
 * Remove and return the next realtime thread to run.  Normally picks the
 * highest-priority level; in non-strict mode it may instead pick the
 * earliest-deadline thread from a lower level when the higher-priority
 * thread's constraint can still be met.  Queue must be non-empty.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	/* Highest occupied priority level. */
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		/* Level currently holding the earliest deadline overall. */
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			/*
			 * Running the earliest-deadline thread first is OK only
			 * if the higher-priority thread can still finish within
			 * its constraint afterwards (with epsilon slack).
			 */
			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Level still occupied: its new head defines the level's deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Level drained: clear its occupancy bit. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across all levels (reuses i). */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq(new_thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4371
4372 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4373 rt_runq_first(rt_queue_t rt_run_queue)
4374 {
4375 bitmap_t *map = rt_run_queue->bitmap;
4376 int i = bitmap_first(map, NRTQS);
4377 if (i < 0) {
4378 return THREAD_NULL;
4379 }
4380 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4381 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4382
4383 return next_rt;
4384 }
4385
/*
 * Remove a specific thread from the realtime run queue and republish the
 * per-level and queue-wide earliest-deadline state.  Queue must be locked.
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Per-priority sub-queue index, 0-based from BASEPRI_RTQUEUES. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Level still occupied: its new head defines the level's deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Level drained: clear its occupancy bit. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline across all levels (reuses i). */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq_locked(thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4431
/* In the pset-local RT policy, each pset owns its own realtime run queue. */
rt_queue_t
sched_rtlocal_runq(processor_set_t pset)
{
	return &pset->rt_runq;
}
4437
/* Initialize the pset's realtime run queue state for the local RT policy. */
void
sched_rtlocal_init(processor_set_t pset)
{
	pset_rt_init(pset);
}
4443
/*
 * Drain a pset's realtime run queue when its processors go away, and
 * re-setrun each drained thread so it migrates to another pset.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t        thread;
	queue_head_t    tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Collect every queued RT thread on a local list while the pset is locked. */
	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	/* Re-dispatch each thread; thread_setrun() will pick a new home. */
	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4479
4480 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t        thread;

	/* Walk every pset on every node, starting from node 0. */
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Visit each occupied RT priority level in this pset. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				/* Track the oldest make-runnable timestamp seen. */
				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4512
4513 int64_t
sched_rtlocal_runq_count_sum(void)4514 sched_rtlocal_runq_count_sum(void)
4515 {
4516 pset_node_t node = &pset_node0;
4517 processor_set_t pset = node->psets;
4518 int64_t count = 0;
4519
4520 do {
4521 while (pset != NULL) {
4522 count += pset->rt_runq.runq_stats.count_sum;
4523
4524 pset = pset->pset_list;
4525 }
4526 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4527
4528 return count;
4529 }
4530
4531 /*
4532 * Called with stealing_pset locked and
4533 * returns with stealing_pset locked
4534 * but the lock will have been dropped
4535 * if a thread is returned.
4536 */
thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	if (!sched_allow_rt_steal) {
		return THREAD_NULL;
	}
	pset_map_t pset_map = stealing_pset->node->pset_map;

	/* Never consider stealing from ourselves. */
	bit_clear(pset_map, stealing_pset->pset_id);

	/* Tracks which pset's lock we currently hold. */
	processor_set_t pset = stealing_pset;

	processor_set_t target_pset;
	uint64_t target_deadline;

retry:
	target_pset = NULL;
	/* Only steal if the victim's deadline beats ours by at least epsilon. */
	target_deadline = earliest_deadline - rt_deadline_epsilon;

	/* Find the pset with the earliest stealable deadline below the target. */
	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		/*
		 * During startup, while pset_array[] and node->pset_map are still being initialized,
		 * the update to pset_map may become visible to this cpu before the update to pset_array[].
		 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
		 * so just check nset is not NULL instead.
		 */
		if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
			target_deadline = nset->stealable_rt_threads_earliest_deadline;
			target_pset = nset;
		}
	}

	if (target_pset != NULL) {
		/* Swap locks to the victim pset and re-check under its lock. */
		pset = change_locked_pset(pset, target_pset);
		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
			pset_update_rt_stealable_state(pset);
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

			/* Return with the stealing pset locked, as the contract requires. */
			pset = change_locked_pset(pset, stealing_pset);
			return new_thread;
		}
		/* Victim changed under us: go back, refresh our deadline, retry. */
		pset = change_locked_pset(pset, stealing_pset);
		earliest_deadline = rt_runq_earliest_deadline(pset);
		goto retry;
	}

	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
4589
4590 /*
4591 * pset is locked
4592 */
thread_t
sched_rt_choose_thread(processor_set_t pset)
{
	processor_t processor = current_processor();

	if (SCHED(steal_thread_enabled)(pset)) {
		/*
		 * Keep trying to steal while this CPU has a pending spill
		 * request (another pset wants us to pick up its RT work).
		 */
		do {
			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (spill_pending) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
			}
			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
			if (new_thread != THREAD_NULL) {
				/* Acknowledge any spill request that arrived meanwhile. */
				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
				}
				return new_thread;
			}
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}

	/* No steal happened; clear any stale spill request for this CPU. */
	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
	}

	/* Fall back to this pset's own realtime run queue. */
	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}
4627
4628 /*
4629 * realtime_queue_insert:
4630 *
4631 * Enqueue a thread for realtime execution.
4632 */
static bool
realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
{
	pset_assert_locked(pset);

	/* Enqueue in deadline order; rt_runq_enqueue reports whether to preempt. */
	bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
	/* New stealable work may exist — republish this pset's stealable state. */
	pset_update_rt_stealable_state(pset);

	return preempt;
}
4643
4644 /*
4645 * realtime_setrun:
4646 *
4647 * Dispatch a thread for realtime execution.
4648 *
4649 * Thread must be locked. Associated pset must
4650 * be locked, and is returned unlocked.
4651 */
static void
realtime_setrun(
	processor_t     chosen_processor,
	thread_t        thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	/* Tracks whether we still hold the pset lock as we go. */
	bool pset_is_locked = true;

	int n_backup = 0;

	/* Tight-constraint threads get extra backup processors signalled. */
	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/* Credit CPUs already being urgently woken against the backup count. */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	/*
	 * Slot 0 handles the chosen processor itself (possibly via a local
	 * AST); slots 1..n_backup select additional processors to IPI in
	 * case the chosen one is delayed.  IPIs are sent after dropping
	 * the pset lock.
	 */
	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			/* Preempt if we beat the processor's current priority or deadline. */
			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* Idle self: flip to dispatching and take the AST locally. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						/* Idle remote: ask the IPI policy how to wake it. */
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking up: just mark the urgent-AST bit. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					if (processor == current_processor()) {
						/* Running self: take the AST locally. */
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slots: re-take the pset lock if a prior iteration dropped it. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			/* Returns true when it dropped the pset lock on our behalf. */
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/*
			 * NOTE(review): `backup` below is not declared anywhere in
			 * this function as shown — presumably it should be `i` (the
			 * backup slot) or come from a macro; verify this compiles and
			 * traces the intended value.
			 */
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if defined(__x86_64__)
#define p_is_good(p)    (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Lock dropped: now actually send the IPIs we decided on. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4791
4792
/*
 * Decide whether a deferred IPI may be used for the destination CPU.
 * Thread groups flagged for immediate IPIs override deferral; a CPU
 * with a deferred AST already pending gets no additional IPI.
 * Panics if called on a platform without deferred-AST support.
 */
sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
    thread_t thread, __unused sched_ipi_event_t event)
{
#if defined(CONFIG_SCHED_DEFERRED_AST)
#if CONFIG_THREAD_GROUPS
	if (thread) {
		struct thread_group *tg = thread_group_get(thread);
		if (thread_group_uses_immediate_ipi(tg)) {
			return SCHED_IPI_IMMEDIATE;
		}
	}
#endif /* CONFIG_THREAD_GROUPS */
	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
		return SCHED_IPI_DEFERRED;
	}
#else /* CONFIG_SCHED_DEFERRED_AST */
	(void) thread;
	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
#endif /* CONFIG_SCHED_DEFERRED_AST */
	return SCHED_IPI_NONE;
}
4815
sched_ipi_type_t
sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
{
	/*
	 * Decide which kind of IPI (if any) to send to processor 'dst' for the
	 * given scheduler event, and record the corresponding pending-AST state
	 * in dst's pset.  The IPI itself is NOT sent here; the caller later
	 * invokes sched_ipi_perform() with the returned type.
	 *
	 * NOTE(review): the pending_*_cpu_mask bit operations below are done
	 * without taking a lock in this function, so the caller presumably
	 * holds the pset lock — confirm against call sites.
	 */
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	assert(dst != NULL);

	processor_set_t pset = dst->processor_set;
	/* Never IPI ourselves; the current CPU can just act directly. */
	if (current_processor() == dst) {
		return SCHED_IPI_NONE;
	}

	/* Transition an idle target to DISPATCHING before signalling it. */
	bool dst_idle = (dst->state == PROCESSOR_IDLE);
	if (dst_idle) {
		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
	}

	/* Let the active scheduler policy pick the IPI flavor. */
	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	switch (ipi_type) {
	case SCHED_IPI_NONE:
		return SCHED_IPI_NONE;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	case SCHED_IPI_DEFERRED:
		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
		break;
#endif /* CONFIG_SCHED_DEFERRED_AST */
	default:
		/*
		 * Immediate/idle IPIs: mark both the urgent and preempt
		 * pending masks; the tracepoint fires only on the 0->1
		 * transition of the urgent bit.
		 */
		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
		}
		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
		break;
	}
	return ipi_type;
}
4851
4852 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4853 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4854 {
4855 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4856 boolean_t deferred_ipi_supported = false;
4857 processor_set_t pset = dst->processor_set;
4858
4859 #if defined(CONFIG_SCHED_DEFERRED_AST)
4860 deferred_ipi_supported = true;
4861 #endif /* CONFIG_SCHED_DEFERRED_AST */
4862
4863 switch (event) {
4864 case SCHED_IPI_EVENT_SPILL:
4865 case SCHED_IPI_EVENT_SMT_REBAL:
4866 case SCHED_IPI_EVENT_REBALANCE:
4867 case SCHED_IPI_EVENT_BOUND_THR:
4868 case SCHED_IPI_EVENT_RT_PREEMPT:
4869 /*
4870 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4871 * scenarios use immediate IPIs always.
4872 */
4873 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4874 break;
4875 case SCHED_IPI_EVENT_PREEMPT:
4876 /* In the preemption case, use immediate IPIs for RT threads */
4877 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4878 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4879 break;
4880 }
4881
4882 /*
4883 * For Non-RT threads preemption,
4884 * If the core is active, use immediate IPIs.
4885 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4886 */
4887 if (deferred_ipi_supported && dst_idle) {
4888 return sched_ipi_deferred_policy(pset, dst, thread, event);
4889 }
4890 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4891 break;
4892 default:
4893 panic("Unrecognized scheduler IPI event type %d", event);
4894 }
4895 assert(ipi_type != SCHED_IPI_NONE);
4896 return ipi_type;
4897 }
4898
4899 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4900 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4901 {
4902 switch (ipi) {
4903 case SCHED_IPI_NONE:
4904 break;
4905 case SCHED_IPI_IDLE:
4906 machine_signal_idle(dst);
4907 break;
4908 case SCHED_IPI_IMMEDIATE:
4909 cause_ast_check(dst);
4910 break;
4911 case SCHED_IPI_DEFERRED:
4912 machine_signal_idle_deferred(dst);
4913 break;
4914 default:
4915 panic("Unrecognized scheduler IPI type: %d", ipi);
4916 }
4917 }
4918
4919 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4920
4921 boolean_t
priority_is_urgent(int priority)4922 priority_is_urgent(int priority)
4923 {
4924 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4925 }
4926
4927 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4928
4929 /*
4930 * processor_setrun:
4931 *
4932 * Dispatch a thread for execution on a
4933 * processor.
4934 *
4935 * Thread must be locked. Associated pset must
4936 * be locked, and is returned unlocked.
4937 */
static void
processor_setrun(
	processor_t             processor,
	thread_t                thread,
	integer_t               options)
{
	processor_set_t pset = processor->processor_set;
	pset_assert_locked(pset);
	ast_t preempt = AST_NONE;
	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	thread->chosen_processor = processor;

	/*
	 * Set preemption mode.
	 */
#if defined(CONFIG_SCHED_DEFERRED_AST)
	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
#endif
	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
		/* Urgent thread beating the running priority: preempt urgently. */
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if (processor->current_is_eagerpreempt) {
		/* The running thread asked to be preempted eagerly. */
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		/*
		 * A decayed timeshare thread: preempt only if its (urgent)
		 * base priority would beat the running thread and the caller
		 * asked for preemption.
		 */
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else {
		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	}

	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
		/*
		 * Having gone to the trouble of forcing this thread off a less preferred core,
		 * we should force the preferable core to reschedule immediately to give this
		 * thread a chance to run instead of just sitting on the run queue where
		 * it may just be stolen back by the idle core we just forced it off.
		 */
		preempt |= AST_PREEMPT;
	}

	/* Enqueue first, then decide how to signal the target processor. */
	SCHED(processor_enqueue)(processor, thread, options);
	sched_update_pset_load_average(pset, 0);

	if (preempt != AST_NONE) {
		if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			/* Already waking up: just mark the urgent AST as pending. */
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
			}
		} else if ((processor->state == PROCESSOR_RUNNING ||
		    processor->state == PROCESSOR_SHUTDOWN) &&
		    (thread->sched_pri >= processor->current_pri)) {
			ipi_action = eInterruptRunning;
		}
	} else {
		/*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling
		 */
		if (processor->state == PROCESSOR_SHUTDOWN &&
		    thread->sched_pri >= processor->current_pri) {
			ipi_action = eInterruptRunning;
		} else if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
			}
		}
	}

	if (ipi_action != eDoNothing) {
		if (processor == current_processor()) {
			/*
			 * Local case: no IPI needed.  Re-run the context-switch
			 * check under the pset lock and update the pending-AST
			 * masks to match its result.
			 */
			if (ipi_action == eExitIdle) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
			}
			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}

			if ((preempt & AST_URGENT) == AST_URGENT) {
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
				}
			} else {
				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
				}
			}

			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			} else {
				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			}
		} else {
			/* Remote case: pick an IPI to send after dropping the lock. */
			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
			ipi_type = sched_ipi_action(processor, thread, event);
		}
	}

	/* Per the contract above, the pset is returned unlocked. */
	pset_unlock(pset);
	sched_ipi_perform(processor, ipi_type);

	if (ipi_action != eDoNothing && processor == current_processor()) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}
5056
5057 /*
5058 * choose_next_pset:
5059 *
5060 * Return the next sibling pset containing
5061 * available processors.
5062 *
5063 * Returns the original pset if none other is
5064 * suitable.
5065 */
5066 static processor_set_t
choose_next_pset(processor_set_t pset)5067 choose_next_pset(
5068 processor_set_t pset)
5069 {
5070 processor_set_t nset = pset;
5071
5072 do {
5073 nset = next_pset(nset);
5074
5075 /*
5076 * Sometimes during startup the pset_map can contain a bit
5077 * for a pset that isn't fully published in pset_array because
5078 * the pset_map read isn't an acquire load.
5079 *
5080 * In order to avoid needing an acquire barrier here, just bail
5081 * out.
5082 */
5083 if (nset == PROCESSOR_SET_NULL) {
5084 return pset;
5085 }
5086 } while (nset->online_processor_count < 1 && nset != pset);
5087
5088 return nset;
5089 }
5090
5091 /*
5092 * choose_processor:
5093 *
5094 * Choose a processor for the thread, beginning at
5095 * the pset. Accepts an optional processor hint in
5096 * the pset.
5097 *
5098 * Returns a processor, possibly from a different pset.
5099 *
5100 * The thread must be locked. The pset must be locked,
5101 * and the resulting pset is locked on return.
5102 */
processor_t
choose_processor(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread)
{
	/*
	 * NOTE: on success the returned processor's pset is locked (the lock
	 * may have been handed off between psets via change_locked_pset()).
	 * On failure this returns PROCESSOR_NULL with all pset locks dropped.
	 */
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * Prefer the hinted processor, when appropriate.
	 */

	/* Fold last processor hint from secondary processor to its primary */
	if (processor != PROCESSOR_NULL) {
		processor = processor->processor_primary;
	}

	/*
	 * Only consult platform layer if pset is active, which
	 * it may not be in some cases when a multi-set system
	 * is going to sleep.
	 */
	if (pset->online_processor_count) {
		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
			processor_t mc_processor = machine_choose_processor(pset, processor);
			if (mc_processor != PROCESSOR_NULL) {
				processor = mc_processor->processor_primary;
			}
		}
	}

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_SHUTDOWN:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
					uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
					/*
					 * If the rotation bitmask to force a migration is set for this core and there's an idle core that
					 * that needn't be avoided, don't continue running on the same core.
					 */
					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
						return processor;
					}
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 *
	 * A primary/secondary pair of SMT processors are
	 * "unpaired" if the primary is busy but its
	 * corresponding secondary is idle (so the physical
	 * core has full use of its resources).
	 */

	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_secondary_priority = MAXPRI + 1;
	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime path: search all psets in the node, first primaries
		 * only, then (second iteration) including SMT secondaries.
		 */
		pset_node_t node = pset->node;
		bool include_ast_urgent_pending_cpus = false;
		cpumap_t ast_urgent_pending;
try_again:
		ast_urgent_pending = 0;
		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
		for (; consider_secondaries < 2; consider_secondaries++) {
			pset = change_locked_pset(pset, starting_pset);
			do {
				cpumap_t available_map = pset_available_cpumap(pset);
				if (available_map == 0) {
					goto no_available_cpus;
				}

				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
				if (processor) {
					return processor;
				}

				if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
					if (processor) {
						/*
						 * Instead of looping through all the psets to find the global
						 * furthest deadline processor, preempt the first candidate found.
						 * The preempted thread will then find any other available far deadline
						 * processors to preempt.
						 */
						return processor;
					}

					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;

					/* Track the pset with the shortest RT run queue as the enqueue fallback. */
					if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
						assert(cpuid >= 0);
						lc_processor = processor_array[cpuid];
						lowest_count = rt_runq_count(pset);
					}
				}

no_available_cpus:
				nset = next_pset(pset);

				if (nset != starting_pset) {
					pset = change_locked_pset(pset, nset);
				}
			} while (nset != starting_pset);
		}

		/* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
			if (lc_processor) {
				pset_assert_locked(lc_processor->processor_set);
				return lc_processor;
			}
		} else {
			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
				include_ast_urgent_pending_cpus = true;
				goto try_again;
			}
		}

		processor = lc_processor;

		if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
			/* Check that chosen processor is still usable */
			cpumap_t available_map = pset_available_cpumap(pset);
			if (bit_test(available_map, processor->cpu_id)) {
				return processor;
			}

			/* processor is no longer usable */
			processor = PROCESSOR_NULL;
		}

		pset_assert_locked(pset);
		pset_unlock(pset);
		return PROCESSOR_NULL;
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */
		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
		uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		/*
		 * Look at the preferred cores first.
		 * Rotate from cpu_preferred_last_chosen to spread load.
		 */
		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_primary_map);
		}
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			pset->cpu_preferred_last_chosen = cpuid;
			return processor;
		}

		/*
		 * Look at the cores that don't need to be avoided next.
		 */
		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
			uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_primary_map);
			}
			if (cpuid >= 0) {
				processor = processor_array[cpuid];
				pset->cpu_preferred_last_chosen = cpuid;
				return processor;
			}
		}

		/*
		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
		 */
		cpuid = lsb_first(idle_primary_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate so the scan starts just after the last chosen CPU. */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			processor_t primary = processor->processor_primary;
			if (primary != processor) {
				/* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
					if (cpri < lowest_secondary_priority) {
						lowest_secondary_priority = cpri;
						lp_paired_secondary_processor = processor;
					}
				}
			} else {
				if (cpri < lowest_priority) {
					lowest_priority = cpri;
					lp_processor = processor;
				}
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
		 * the idle primary would have short-circuited the loop above
		 */
		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    ~pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);

		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
			processor = processor_array[cpuid];

			processor_t cprimary = processor->processor_primary;

			integer_t primary_pri = cprimary->current_pri;

			/*
			 * TODO: This should also make the same decisions
			 * as secondary_can_run_realtime_thread
			 *
			 * TODO: Keep track of the pending preemption priority
			 * of the primary to make this more accurate.
			 */

			/* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
				continue;
			}

			/*
			 * Find the idle secondary processor with the lowest priority primary
			 *
			 * We will choose this processor as a fallback if we find no better
			 * primary to preempt.
			 */
			if (primary_pri < lowest_idle_secondary_priority) {
				lp_idle_secondary_processor = processor;
				lowest_idle_secondary_priority = primary_pri;
			}

			/* Find the the lowest priority active primary with idle secondary */
			if (primary_pri < lowest_unpaired_primary_priority) {
				/* If the primary processor is offline or starting up, it's not a candidate for this path */
				if (cprimary->state != PROCESSOR_RUNNING &&
				    cprimary->state != PROCESSOR_DISPATCHING) {
					continue;
				}

				if (!cprimary->is_recommended) {
					continue;
				}

				/* if the primary is pending preemption, don't try to re-preempt it */
				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				lowest_unpaired_primary_priority = primary_pri;
				lp_unpaired_primary_processor = cprimary;
			}
		}

		/*
		 * We prefer preempting a primary processor over waking up its secondary.
		 * The secondary will then be woken up by the preempted thread.
		 */
		if (thread->sched_pri > lowest_unpaired_primary_priority) {
			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
			return lp_unpaired_primary_processor;
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuted.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/*
	 * Make sure that we pick a running processor,
	 * and that the correct processor set is locked.
	 * Since we may have unlocked the candidate processor's
	 * pset, it may have changed state.
	 *
	 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the least priority
	 * primary, or the least busy primary.
	 */

	/* lowest_priority is evaluated in the main loops above */
	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
		processor = lp_idle_secondary_processor;
	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
		processor = lp_paired_secondary_processor;
	} else if (lc_processor != PROCESSOR_NULL) {
		processor = lc_processor;
	} else {
		processor = PROCESSOR_NULL;
	}

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5583
5584 /*
5585 * Default implementation of SCHED(choose_node)()
5586 * for single node systems
5587 */
pset_node_t
sched_choose_node(__unused thread_t thread)
{
	/* Single-node systems always schedule on node 0, whatever the thread. */
	return &pset_node0;
}
5593
5594 /*
5595 * choose_starting_pset:
5596 *
5597 * Choose a starting processor set for the thread.
5598 * May return a processor hint within the pset.
5599 *
5600 * Returns a starting processor set, to be used by
5601 * choose_processor.
5602 *
5603 * The thread must be locked. The resulting pset is unlocked on return,
5604 * and is chosen without taking any pset locks.
5605 */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 * Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 * No Affinity case:
		 *
		 * Utilitize a per task hint to spread threads
		 * among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime: steer toward a pset whose primaries are not already
		 * running RT threads.  bit_ror64 rotates pset 0 out of first
		 * position so lsb_first skips it when avoiding cpu0.
		 */
		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
		/* Second chance on SMT systems: allow secondaries that are not running RT. */
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-RT: if the chosen pset has no idle CPU, prefer one that does. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* Only publish the last-processor hint if it lives in the chosen pset. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
5724
5725 /*
5726 * thread_setrun:
5727 *
5728 * Dispatch thread for execution, onto an idle
5729 * processor or run queue, and signal a preemption
5730 * as appropriate.
5731 *
5732 * Thread must be locked.
5733 */
void
thread_setrun(
	thread_t thread,
	sched_options_t options)
{
	processor_t processor = PROCESSOR_NULL;
	processor_set_t pset;

	/* Caller guarantees the thread is runnable and not on any run queue. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	thread_assert_runq_null(thread);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 * Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	/* Re-classify the thread for Selective Forced Idle before dispatch. */
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary. A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
			if (processor != PROCESSOR_NULL) {
				/* On success we return holding the chosen processor's pset lock. */
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely,
		 * choose the master_processor, which is always
		 * safe to choose.
		 */
		if (processor == PROCESSOR_NULL) {
			/* Choose fallback processor */
			processor = master_processor;
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
		}
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 * Bound case:
		 *
		 * Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 * Dispatch the thread on the chosen processor.
	 * TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		/* Only unbound threads are eligible to be stolen by other psets. */
		SCHED(check_spill)(pset, thread);
	}
}
5836
5837 processor_set_t
task_choose_pset(task_t task)5838 task_choose_pset(
5839 task_t task)
5840 {
5841 processor_set_t pset = task->pset_hint;
5842
5843 if (pset != PROCESSOR_SET_NULL) {
5844 pset = choose_next_pset(pset);
5845 }
5846
5847 return pset;
5848 }
5849
5850 /*
5851 * Check for a preemption point in
5852 * the current context.
5853 *
5854 * Called at splsched with thread locked.
5855 */
ast_t
csw_check(
	thread_t thread,
	processor_t processor,
	ast_t check_reason)
{
	processor_set_t pset = processor->processor_set;

	/* Only valid to call against the processor's currently running thread. */
	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Refresh the processor's published view of the running thread's state. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	/* May arm/clear the deferred nonurgent-preemption timer and adjust the AST. */
	return update_pending_nonurgent_preemption(processor, preempt);
}
5888
5889 void
clear_pending_nonurgent_preemption(processor_t processor)5890 clear_pending_nonurgent_preemption(processor_t processor)
5891 {
5892 if (!processor->pending_nonurgent_preemption) {
5893 return;
5894 }
5895
5896 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5897
5898 processor->pending_nonurgent_preemption = false;
5899 running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5900 }
5901
/*
 * Decide how to deliver a preemption AST computed by csw_check_locked().
 * Urgent (or non-preempt) results clear any armed deferral timer; a purely
 * nonurgent AST_PREEMPT may be deferred until the userspace boundary, with
 * a timer armed to bound the deferral. Returns the (possibly upgraded) AST.
 */
ast_t
update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
{
	/* Only a nonurgent AST_PREEMPT (AST_PREEMPT set, AST_URGENT clear) is deferrable. */
	if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
		clear_pending_nonurgent_preemption(processor);
		return reason;
	}

	if (nonurgent_preemption_timer_abs == 0) {
		/* Preemption timer not enabled */
		return reason;
	}

	if (current_thread()->state & TH_IDLE) {
		/* idle threads don't need nonurgent preemption */
		return reason;
	}

	if (processor->pending_nonurgent_preemption) {
		/* Timer is already armed, no need to do it again */
		return reason;
	}

	if (ml_did_interrupt_userspace()) {
		/*
		 * We're preempting userspace here, so we don't need
		 * to defer the preemption. Force AST_URGENT
		 * so that we can avoid arming this timer without risking
		 * ast_taken_user deciding to spend too long in kernel
		 * space to handle other ASTs.
		 */

		return reason | AST_URGENT;
	}

	/*
	 * We've decided to do a nonurgent preemption when running in
	 * kernelspace. We defer the preemption until reaching userspace boundary
	 * to give a grace period for locks etc to be dropped and to reach
	 * a clean preemption point, so that the preempting thread doesn't
	 * always immediately hit the lock that the waking thread still holds.
	 *
	 * Arm a timer to enforce that the preemption executes within a bounded
	 * time if the thread doesn't block or return to userspace quickly.
	 */

	processor->pending_nonurgent_preemption = true;
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
	    reason);

	uint64_t now = mach_absolute_time();

	uint64_t deadline = now + nonurgent_preemption_timer_abs;

	/* Expiry runs thread_preempt_expire() on this processor. */
	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
	    now, deadline);

	return reason;
}
5961
5962 /*
5963 * Check for preemption at splsched with
5964 * pset and thread locked
5965 */
/*
 * Core preemption-decision logic. Checks are ordered from most to least
 * urgent; the first matching condition determines the returned AST.
 * check_reason bits are OR'd into the result so callers can tag the cause.
 */
ast_t
csw_check_locked(
	thread_t thread,
	processor_t processor,
	processor_set_t pset,
	ast_t check_reason)
{
	/*
	 * If the current thread is running on a processor that is no longer recommended,
	 * urgently preempt it, at which point thread_select() should
	 * try to idle the processor and re-dispatch the thread to a recommended processor.
	 */
	if (!processor->is_recommended) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/* A realtime thread is waiting to spill onto this CPU. */
	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	if (rt_runq_count(pset) > 0) {
		/*
		 * Queued realtime work: preempt urgently if it outranks the
		 * running thread, the quantum has expired, or a queued thread's
		 * deadline (within epsilon) beats this processor's deadline.
		 */
		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else {
			return check_reason | AST_PREEMPT;
		}
	}

	/* Ask the scheduler policy whether its run queues warrant a switch. */
	ast_t result = SCHED(processor_csw_check)(processor);
	if (result != AST_NONE) {
		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
	}

	/*
	 * Same for avoid-processor
	 *
	 * TODO: Should these set AST_REBALANCE?
	 */
	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
		return check_reason | AST_PREEMPT;
	}

	/*
	 * Even though we could continue executing on this processor, a
	 * secondary SMT core should try to shed load to another primary core.
	 *
	 * TODO: Should this do the same check that thread_select does? i.e.
	 * if no bound threads target this processor, and idle primaries exist, preempt
	 * The case of RT threads existing is already taken care of above
	 */

	if (processor->current_pri < BASEPRI_RTQUEUES &&
	    processor->processor_primary != processor) {
		return check_reason | AST_PREEMPT;
	}

	/* Suspended threads must reach a stopping point via preemption. */
	if (thread->state & TH_SUSP) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SFI
	/*
	 * Current thread may not need to be preempted, but maybe needs
	 * an SFI wait?
	 */
	result = sfi_thread_needs_ast(thread, NULL);
	if (result != AST_NONE) {
		return result;
	}
#endif

	return AST_NONE;
}
6041
6042 /*
6043 * Handle preemption IPI or IPI in response to setting an AST flag
6044 * Triggered by cause_ast_check
6045 * Called at splsched
6046 */
void
ast_check(processor_t processor)
{
	/* Acknowledge any SMR cross-CPU synchronization IPI first. */
	smr_ack_ipi();

	/* Nothing to evaluate unless this CPU is actively running threads. */
	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		return;
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
}
6117
6118
/*
 * Running-timer callback for RUNNING_TIMER_PREEMPT, armed by
 * update_pending_nonurgent_preemption(). Fires when a deferred
 * nonurgent preemption has been pending too long; re-evaluates
 * the preemption with AST_URGENT forced so it can no longer be deferred.
 * Runs on the owning processor at timer-expiry context.
 */
void
thread_preempt_expire(
	timer_call_param_t p0,
	__unused timer_call_param_t p1)
{
	processor_t processor = p0;

	assert(processor == current_processor());
	assert(p1 == NULL);

	thread_t thread = current_thread();

	/*
	 * This is set and cleared by the current core, so we will
	 * never see a race with running timer expiration
	 */
	assert(processor->pending_nonurgent_preemption);

	clear_pending_nonurgent_preemption(processor);

	thread_lock(thread);

	/*
	 * Check again to see if it's still worth a
	 * context switch, but this time force enable kernel preemption
	 */

	ast_t preempt = csw_check(thread, processor, AST_URGENT);

	if (preempt) {
		ast_on(preempt);
	}

	thread_unlock(thread);

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
}
6156
6157
6158 /*
6159 * set_sched_pri:
6160 *
6161 * Set the scheduled priority of the specified thread.
6162 *
6163 * This may cause the thread to change queues.
6164 *
6165 * Thread must be locked.
6166 */
void
set_sched_pri(
	thread_t thread,
	int16_t new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	/* SETPRI_LAZY suppresses preemption checks / cross-CPU ASTs for this change. */
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* The running thread is never on a run queue. */
		assert(thread->state & TH_RUN);
		thread_assert_runq_null(thread);
	} else {
		/* Dequeue before the priority change; re-queued at the new priority below. */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/* Runnable-but-not-queued: thread may be running on another CPU. */
		processor_t processor = thread->last_processor;

		/* Poke that CPU so it re-evaluates preemption with the new priority. */
		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
6278
6279 /*
6280 * thread_run_queue_remove_for_handoff
6281 *
6282 * Pull a thread or its (recursive) push target out of the runqueue
6283 * so that it is ready for thread_run()
6284 *
6285 * Called at splsched
6286 *
6287 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6288 * This may be different than the thread that was passed in.
6289 */
thread_t
thread_run_queue_remove_for_handoff(thread_t thread)
{
	thread_t pulled_thread = THREAD_NULL;

	thread_lock(thread);

	/*
	 * Check that the thread is not bound to a different processor,
	 * NO_SMT flag is not set on the thread, cluster type of
	 * processor matches with thread if the thread is pinned to a
	 * particular cluster and that realtime is not involved.
	 *
	 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
	 */
	processor_t processor = current_processor();
	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
	    && (!thread_no_smt(thread))
	    && (processor->current_pri < BASEPRI_RTQUEUES)
	    && (thread->sched_pri < BASEPRI_RTQUEUES)
#if __AMP__
	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
	    ) {
		/* Eligible for handoff only if we actually dequeued it here. */
		if (thread_run_queue_remove(thread)) {
			pulled_thread = thread;
		}
	}

	thread_unlock(thread);

	/* THREAD_NULL if the thread was ineligible or already dequeued elsewhere. */
	return pulled_thread;
}
6324
6325 /*
6326 * thread_prepare_for_handoff
6327 *
6328 * Make the thread ready for handoff.
6329 * If the thread was runnable then pull it off the runq, if the thread could
6330 * not be pulled, return NULL.
6331 *
6332 * If the thread was woken up from wait for handoff, make sure it is not bound to
6333 * different processor.
6334 *
6335 * Called at splsched
6336 *
6337 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6338 * This may be different than the thread that was passed in.
6339 */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/* Thread was just woken for handoff; it is not on a run queue. */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
		    && (!thread_no_smt(thread))
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			/* Not eligible to run here - dispatch it normally instead. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Runnable case: try to pull the thread off its run queue. */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6374
6375 /*
6376 * thread_run_queue_remove:
6377 *
6378 * Remove a thread from its current run queue and
6379 * return TRUE if successful.
6380 *
6381 * Thread must be locked.
6382 *
6383 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6384 * run queues because the caller locked the thread. Otherwise
6385 * the thread is on a run queue, but could be chosen for dispatch
6386 * and removed by another processor under a different lock, which
6387 * will set thread->runq to PROCESSOR_NULL.
6388 *
6389 * Hence the thread select path must not rely on anything that could
6390 * be changed under the thread lock after calling this function,
6391 * most importantly thread->sched_pri.
6392 */
boolean_t
thread_run_queue_remove(
	thread_t thread)
{
	boolean_t removed = FALSE;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		thread_assert_runq_null(thread);
		return FALSE;
	}

	processor_t processor = thread_get_runq(thread);
	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	/* Non-realtime threads are removed via the scheduler policy's hook. */
	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime path: the RT run queue is protected by the pset lock. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/*
	 * Must re-read the thread runq after acquiring the pset lock, in
	 * case another core swooped in before us to dequeue the thread.
	 */
	if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6447
6448 /*
6449 * Put the thread back where it goes after a thread_run_queue_remove
6450 *
6451 * Thread must have been removed under the same thread lock hold
6452 *
6453 * thread locked, at splsched
6454 */
6455 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6456 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6457 {
6458 thread_assert_runq_null(thread);
6459 assert(thread->state & (TH_RUN));
6460
6461 thread_setrun(thread, options);
6462 }
6463
6464 void
sys_override_cpu_throttle(boolean_t enable_override)6465 sys_override_cpu_throttle(boolean_t enable_override)
6466 {
6467 if (enable_override) {
6468 cpu_throttle_enabled = 0;
6469 } else {
6470 cpu_throttle_enabled = 1;
6471 }
6472 }
6473
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task, or the idle thread: no urgency to report. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime: report the thread's RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * Background urgency applied when thread priority is
		 * MAXPRI_THROTTLE or lower and thread is not promoted
		 * and thread has a QoS specified
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Output parameters are optional; callers may pass NULL for either. */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6535
6536 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6537 thread_get_perfcontrol_class(thread_t thread)
6538 {
6539 /* Special case handling */
6540 if (thread->state & TH_IDLE) {
6541 return PERFCONTROL_CLASS_IDLE;
6542 }
6543
6544 if (thread->sched_mode == TH_MODE_REALTIME) {
6545 return PERFCONTROL_CLASS_REALTIME;
6546 }
6547
6548 /* perfcontrol_class based on base_pri */
6549 if (thread->base_pri <= MAXPRI_THROTTLE) {
6550 return PERFCONTROL_CLASS_BACKGROUND;
6551 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6552 return PERFCONTROL_CLASS_UTILITY;
6553 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6554 return PERFCONTROL_CLASS_NONUI;
6555 } else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6556 return PERFCONTROL_CLASS_USER_INITIATED;
6557 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6558 return PERFCONTROL_CLASS_UI;
6559 } else {
6560 if (get_threadtask(thread) == kernel_task) {
6561 /*
6562 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6563 * All other lower priority kernel threads should be treated
6564 * as regular threads for performance control purposes.
6565 */
6566 return PERFCONTROL_CLASS_KERNEL;
6567 }
6568 return PERFCONTROL_CLASS_ABOVEUI;
6569 }
6570 }
6571
6572 /*
6573 * This is the processor idle loop, which just looks for other threads
6574 * to execute. Processor idle threads invoke this without supplying a
6575 * current thread to idle without an asserted wait state.
6576 *
6577 * Returns a the next thread to execute if dispatched directly.
6578 */
6579
/* Flip to "#if 1" to emit verbose idle-loop tracepoints; compiled out by default. */
#if 0
#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
#else
#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
#endif

#if (DEVELOPMENT || DEBUG)
/* Debug knob: when set to a cpu_id, processor_idle() injects a 500us delay on idle exit for that CPU. */
int sched_idle_delay_cpuid = -1;
#endif
6589
thread_t
processor_idle(
	thread_t thread,
	processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Account the transition into idle for CPU usage tracking. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Exit conditions: state change, pending ASTs, or RT spill targeting us. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (processor->is_recommended && (processor->processor_primary == processor)) {
			/* Recommended primaries wake for queued realtime work. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Secondaries/derecommended CPUs only wake for bound threads. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Optional debug delay on idle exit (see sched_idle_delay_cpuid). */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Re-acquire visibility of remote state changes after waking. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context swithing.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	/* Account the transition back to running. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6715
6716 /*
6717 * Each processor has a dedicated thread which
6718 * executes the idle loop when there is no suitable
6719 * previous context.
6720 *
6721 * This continuation is entered with interrupts disabled.
6722 */
6723 void
idle_thread(__assert_only void * parameter,__unused wait_result_t result)6724 idle_thread(__assert_only void* parameter,
6725 __unused wait_result_t result)
6726 {
6727 assert(ml_get_interrupts_enabled() == FALSE);
6728 assert(parameter == NULL);
6729
6730 processor_t processor = current_processor();
6731
6732 smr_cpu_leave(processor, processor->last_dispatch);
6733
6734 /*
6735 * Ensure that anything running in idle context triggers
6736 * preemption-disabled checks.
6737 */
6738 disable_preemption_without_measurements();
6739
6740 /*
6741 * Enable interrupts temporarily to handle any pending interrupts
6742 * or IPIs before deciding to sleep
6743 */
6744 spllo();
6745
6746 thread_t new_thread = processor_idle(THREAD_NULL, processor);
6747 /* returns with interrupts disabled */
6748
6749 enable_preemption();
6750
6751 if (new_thread != THREAD_NULL) {
6752 thread_run(processor->idle_thread,
6753 idle_thread, NULL, new_thread);
6754 /*NOTREACHED*/
6755 }
6756
6757 thread_block(idle_thread);
6758 /*NOTREACHED*/
6759 }
6760
6761 kern_return_t
idle_thread_create(processor_t processor)6762 idle_thread_create(
6763 processor_t processor)
6764 {
6765 kern_return_t result;
6766 thread_t thread;
6767 spl_t s;
6768 char name[MAXTHREADNAMESIZE];
6769
6770 result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6771 if (result != KERN_SUCCESS) {
6772 return result;
6773 }
6774
6775 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6776 thread_set_thread_name(thread, name);
6777
6778 s = splsched();
6779 thread_lock(thread);
6780 thread->bound_processor = processor;
6781 processor->idle_thread = thread;
6782 thread->sched_pri = thread->base_pri = IDLEPRI;
6783 thread->state = (TH_RUN | TH_IDLE);
6784 thread->options |= TH_OPT_IDLE_THREAD;
6785 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6786 thread_unlock(thread);
6787 splx(s);
6788
6789 thread_deallocate(thread);
6790
6791 return KERN_SUCCESS;
6792 }
6793
6794 static void sched_update_powered_cores_continue(void);
6795
6796 /*
6797 * sched_startup:
6798 *
6799 * Kicks off scheduler services.
6800 *
6801 * Called at splsched.
6802 */
6803 void
sched_startup(void)6804 sched_startup(void)
6805 {
6806 kern_return_t result;
6807 thread_t thread;
6808
6809 simple_lock_init(&sched_vm_group_list_lock, 0);
6810
6811 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6812 NULL, MAXPRI_KERNEL, &thread);
6813 if (result != KERN_SUCCESS) {
6814 panic("sched_startup");
6815 }
6816
6817 thread_deallocate(thread);
6818
6819 assert_thread_magic(thread);
6820
6821 /*
6822 * Yield to the sched_init_thread once, to
6823 * initialize our own thread after being switched
6824 * back to.
6825 *
6826 * The current thread is the only other thread
6827 * active at this point.
6828 */
6829 thread_block(THREAD_CONTINUE_NULL);
6830
6831 result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
6832 NULL, MAXPRI_KERNEL, &thread);
6833 if (result != KERN_SUCCESS) {
6834 panic("sched_startup");
6835 }
6836
6837 thread_deallocate(thread);
6838
6839 assert_thread_magic(thread);
6840 }
6841
#if __arm64__
/* Deadline for the perf-controller callback; 0 means no callback is armed. */
static _Atomic uint64_t sched_perfcontrol_callback_deadline;
#endif /* __arm64__ */


#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Next abstime at which the maintenance thread should be signalled. */
static volatile uint64_t sched_maintenance_deadline;
/* Abstime of the previous maintenance pass (0 until the first pass). */
static uint64_t sched_tick_last_abstime;
/* Pseudo-tick count accounted by the current maintenance pass. */
static uint64_t sched_tick_delta;
/* Largest sched_tick_delta ever observed. */
uint64_t sched_tick_max_delta;
6853
6854
6855 /*
6856 * sched_init_thread:
6857 *
6858 * Perform periodic bookkeeping functions about ten
6859 * times per second.
6860 */
6861 void
sched_timeshare_maintenance_continue(void)6862 sched_timeshare_maintenance_continue(void)
6863 {
6864 uint64_t sched_tick_ctime, late_time;
6865
6866 struct sched_update_scan_context scan_context = {
6867 .earliest_bg_make_runnable_time = UINT64_MAX,
6868 .earliest_normal_make_runnable_time = UINT64_MAX,
6869 .earliest_rt_make_runnable_time = UINT64_MAX
6870 };
6871
6872 sched_tick_ctime = mach_absolute_time();
6873
6874 if (__improbable(sched_tick_last_abstime == 0)) {
6875 sched_tick_last_abstime = sched_tick_ctime;
6876 late_time = 0;
6877 sched_tick_delta = 1;
6878 } else {
6879 late_time = sched_tick_ctime - sched_tick_last_abstime;
6880 sched_tick_delta = late_time / sched_tick_interval;
6881 /* Ensure a delta of 1, since the interval could be slightly
6882 * smaller than the sched_tick_interval due to dispatch
6883 * latencies.
6884 */
6885 sched_tick_delta = MAX(sched_tick_delta, 1);
6886
6887 /* In the event interrupt latencies or platform
6888 * idle events that advanced the timebase resulted
6889 * in periods where no threads were dispatched,
6890 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
6891 * iterations.
6892 */
6893 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
6894
6895 sched_tick_last_abstime = sched_tick_ctime;
6896 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
6897 }
6898
6899 scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
6900 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
6901 sched_tick_delta, late_time, 0, 0, 0);
6902
6903 /* Add a number of pseudo-ticks corresponding to the elapsed interval
6904 * This could be greater than 1 if substantial intervals where
6905 * all processors are idle occur, which rarely occurs in practice.
6906 */
6907
6908 sched_tick += sched_tick_delta;
6909
6910 update_vm_info();
6911
6912 /*
6913 * Compute various averages.
6914 */
6915 compute_averages(sched_tick_delta);
6916
6917 /*
6918 * Scan the run queues for threads which
6919 * may need to be updated, and find the earliest runnable thread on the runqueue
6920 * to report its latency.
6921 */
6922 SCHED(thread_update_scan)(&scan_context);
6923
6924 SCHED(rt_runq_scan)(&scan_context);
6925
6926 uint64_t ctime = mach_absolute_time();
6927
6928 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
6929 ctime - scan_context.earliest_bg_make_runnable_time : 0;
6930
6931 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
6932 ctime - scan_context.earliest_normal_make_runnable_time : 0;
6933
6934 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
6935 ctime - scan_context.earliest_rt_make_runnable_time : 0;
6936
6937 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
6938
6939 /*
6940 * Check to see if the special sched VM group needs attention.
6941 */
6942 sched_vm_group_maintenance();
6943
6944 #if __arm64__
6945 /* Check to see if the recommended cores failsafe is active */
6946 sched_recommended_cores_maintenance();
6947 #endif /* __arm64__ */
6948
6949
6950 #if DEBUG || DEVELOPMENT
6951 #if __x86_64__
6952 #include <i386/misc_protos.h>
6953 /* Check for long-duration interrupts */
6954 mp_interrupt_watchdog();
6955 #endif /* __x86_64__ */
6956 #endif /* DEBUG || DEVELOPMENT */
6957
6958 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
6959 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
6960 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
6961
6962 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
6963 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
6964 /*NOTREACHED*/
6965 }
6966
/* Count of times the maintenance thread has been signalled from here. */
static uint64_t sched_maintenance_wakeups;

/*
 * Determine if the set of routines formerly driven by a maintenance timer
 * must be invoked, based on a deadline comparison. Signals the scheduler
 * maintenance thread on deadline expiration. Must be invoked at an interval
 * lower than the "sched_tick_interval", currently accomplished by
 * invocation via the quantum expiration timer and at context switch time.
 * Performance matters: this routine reuses a timestamp approximating the
 * current absolute time received from the caller, and should perform
 * no more than a comparison against the deadline in the common case.
 */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread must not try to wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/* Only the CPU that wins the CAS advances the deadline and signals. */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 claims the work; the winner computes load and re-arms. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
7029
7030 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7031
7032 void
sched_init_thread(void)7033 sched_init_thread(void)
7034 {
7035 thread_block(THREAD_CONTINUE_NULL);
7036
7037 thread_t thread = current_thread();
7038
7039 thread_set_thread_name(thread, "sched_maintenance_thread");
7040
7041 sched_maintenance_thread = thread;
7042
7043 SCHED(maintenance_continuation)();
7044
7045 /*NOTREACHED*/
7046 }
7047
7048 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
7049
7050 /*
7051 * thread_update_scan / runq_scan:
7052 *
7053 * Scan the run queues to account for timesharing threads
7054 * which need to be updated.
7055 *
7056 * Scanner runs in two passes. Pass one squirrels likely
7057 * threads away in an array, pass two does the update.
7058 *
7059 * This is necessary because the run queue is locked for
7060 * the candidate scan, but the thread is locked for the update.
7061 *
7062 * Array should be sized to make forward progress, without
7063 * disabling preemption for long periods.
7064 */
7065
#define THREAD_UPDATE_SIZE 128

/* Pass-one stash of candidate threads; each entry holds a +1 reference. */
static thread_t thread_update_array[THREAD_UPDATE_SIZE];
static uint32_t thread_update_count = 0;
7070
7071 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7072 boolean_t
thread_update_add_thread(thread_t thread)7073 thread_update_add_thread(thread_t thread)
7074 {
7075 if (thread_update_count == THREAD_UPDATE_SIZE) {
7076 return FALSE;
7077 }
7078
7079 thread_update_array[thread_update_count++] = thread;
7080 thread_reference(thread);
7081 return TRUE;
7082 }
7083
7084 void
thread_update_process_threads(void)7085 thread_update_process_threads(void)
7086 {
7087 assert(thread_update_count <= THREAD_UPDATE_SIZE);
7088
7089 for (uint32_t i = 0; i < thread_update_count; i++) {
7090 thread_t thread = thread_update_array[i];
7091 assert_thread_magic(thread);
7092 thread_update_array[i] = THREAD_NULL;
7093
7094 spl_t s = splsched();
7095 thread_lock(thread);
7096 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7097 SCHED(update_priority)(thread);
7098 }
7099 thread_unlock(thread);
7100 splx(s);
7101
7102 thread_deallocate(thread);
7103 }
7104
7105 thread_update_count = 0;
7106 }
7107
7108 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)7109 runq_scan_thread(
7110 thread_t thread,
7111 sched_update_scan_context_t scan_context)
7112 {
7113 assert_thread_magic(thread);
7114
7115 if (thread->sched_stamp != sched_tick &&
7116 thread->sched_mode == TH_MODE_TIMESHARE) {
7117 if (thread_update_add_thread(thread) == FALSE) {
7118 return TRUE;
7119 }
7120 }
7121
7122 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7123 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7124 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7125 }
7126 } else {
7127 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7128 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7129 }
7130 }
7131
7132 return FALSE;
7133 }
7134
7135 /*
7136 * Scan a runq for candidate threads.
7137 *
7138 * Returns TRUE if retry is needed.
7139 */
7140 boolean_t
runq_scan(run_queue_t runq,sched_update_scan_context_t scan_context)7141 runq_scan(
7142 run_queue_t runq,
7143 sched_update_scan_context_t scan_context)
7144 {
7145 int count = runq->count;
7146 int queue_index;
7147
7148 assert(count >= 0);
7149
7150 if (count == 0) {
7151 return FALSE;
7152 }
7153
7154 for (queue_index = bitmap_first(runq->bitmap, NRQS);
7155 queue_index >= 0;
7156 queue_index = bitmap_next(runq->bitmap, queue_index)) {
7157 thread_t thread;
7158 circle_queue_t queue = &runq->queues[queue_index];
7159
7160 cqe_foreach_element(thread, queue, runq_links) {
7161 assert(count > 0);
7162 if (runq_scan_thread(thread, scan_context) == TRUE) {
7163 return TRUE;
7164 }
7165 count--;
7166 }
7167 }
7168
7169 return FALSE;
7170 }
7171
7172 #if CONFIG_SCHED_CLUTCH
7173
/*
 * Clutch-scheduler variant of the timeshare update scan: walks the
 * timeshare thread list, queueing stale threads for priority updates.
 * Returns TRUE if the scan must be retried (update array filled up).
 */
boolean_t
sched_clutch_timeshare_scan(
	queue_t thread_queue,
	uint16_t thread_count,
	sched_update_scan_context_t scan_context)
{
	if (thread_count == 0) {
		return FALSE;
	}

	thread_t thread;
	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
		if (runq_scan_thread(thread, scan_context) == TRUE) {
			return TRUE;
		}
		thread_count--;
	}

	/* Every thread reported by the caller should have been visited. */
	assert(thread_count == 0);
	return FALSE;
}
7195
7196
7197 #endif /* CONFIG_SCHED_CLUTCH */
7198
7199 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7200
7201 bool
thread_is_eager_preempt(thread_t thread)7202 thread_is_eager_preempt(thread_t thread)
7203 {
7204 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7205 }
7206
/*
 * Mark a thread as eagerly preemptible. If it is the current thread,
 * immediately re-evaluate preemption; if it is running elsewhere,
 * poke that processor to re-check its AST state.
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		/* Drop the thread lock before potentially context-switching. */
		thread_unlock(thread);

		if (ast != AST_NONE) {
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/*
		 * If the thread appears to be on-core elsewhere, have that
		 * processor re-evaluate whether it should preempt.
		 */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
7240
7241 void
thread_clear_eager_preempt(thread_t thread)7242 thread_clear_eager_preempt(thread_t thread)
7243 {
7244 spl_t s = splsched();
7245 thread_lock(thread);
7246
7247 assert(thread_is_eager_preempt(thread));
7248
7249 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7250
7251 if (thread == current_thread()) {
7252 current_processor()->current_is_eagerpreempt = false;
7253 }
7254
7255 thread_unlock(thread);
7256 splx(s);
7257 }
7258
7259 /*
7260 * Scheduling statistics
7261 */
7262 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7263 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7264 {
7265 struct sched_statistics *stats;
7266 boolean_t to_realtime = FALSE;
7267
7268 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7269 stats->csw_count++;
7270
7271 if (otherpri >= BASEPRI_REALTIME) {
7272 stats->rt_sched_count++;
7273 to_realtime = TRUE;
7274 }
7275
7276 if ((reasons & AST_PREEMPT) != 0) {
7277 stats->preempt_count++;
7278
7279 if (selfpri >= BASEPRI_REALTIME) {
7280 stats->preempted_rt_count++;
7281 }
7282
7283 if (to_realtime) {
7284 stats->preempted_by_rt_count++;
7285 }
7286 }
7287 }
7288
7289 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7290 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7291 {
7292 uint64_t timestamp = mach_absolute_time();
7293
7294 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7295 stats->last_change_timestamp = timestamp;
7296 }
7297
7298 /*
7299 * For calls from assembly code
7300 */
7301 #undef thread_wakeup
7302 void
7303 thread_wakeup(
7304 event_t x);
7305
7306 void
thread_wakeup(event_t x)7307 thread_wakeup(
7308 event_t x)
7309 {
7310 thread_wakeup_with_result(x, THREAD_AWAKENED);
7311 }
7312
7313 boolean_t
preemption_enabled(void)7314 preemption_enabled(void)
7315 {
7316 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7317 }
7318
7319 static void
sched_timer_deadline_tracking_init(void)7320 sched_timer_deadline_tracking_init(void)
7321 {
7322 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
7323 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
7324 }
7325
/* Most recently requested powered-core mask, and the reason given for it. */
static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
processor_reason_t latest_requested_reason = REASON_NONE;
/* Mask most recently handed to sched_update_powered_cores(). */
static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
/* Set by sched_override_available_cores_for_sleep(); cleared on restore. */
bool perfcontrol_sleep_override = false;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
/* Nesting count of suspend_cluster_powerdown() calls (see suspend/resume). */
int32_t cluster_powerdown_suspend_count = 0;
7334
/*
 * Report whether the sleep override is in effect. The flag is written
 * under sched_available_cores_lock; the acquire fence here orders the
 * caller's subsequent reads against the flag load.
 */
bool
sched_is_in_sleep(void)
{
	os_atomic_thread_fence(acquire);
	return perfcontrol_sleep_override;
}
7341
/*
 * Continuation for the powered-cores worker thread: applies the latest
 * requested powered-core mask, looping until the applied state catches
 * up with the most recent request, then blocks awaiting the next wakeup
 * from sched_perfcontrol_update_powered_cores().
 */
static void
sched_update_powered_cores_continue(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (!cluster_powerdown_suspend_count) {
		/* Snapshot the latest request under the spinlock. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		uint64_t latest = latest_requested_powered_cores;
		processor_reason_t reason = latest_requested_reason;
		uint64_t current = current_requested_powered_cores;
		current_requested_powered_cores = latest;
		bool in_sleep = perfcontrol_sleep_override;

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		/*
		 * New requests may arrive while sched_update_powered_cores()
		 * runs without the spinlock; keep re-reading until the applied
		 * state matches the latest request.
		 */
		while (latest != current) {
			if (!in_sleep) {
				assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
			}

			s = splsched();
			simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

			latest = latest_requested_powered_cores;
			reason = latest_requested_reason;
			current = current_requested_powered_cores;
			current_requested_powered_cores = latest;
			in_sleep = perfcontrol_sleep_override;

			simple_unlock(&sched_available_cores_lock);
			splx(s);
		}

		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);

		/*
		 * A request may have raced with assert_wait(); if so, cancel
		 * the wait so this thread runs again immediately.
		 */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
		}
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);

	thread_block((thread_continue_t)sched_update_powered_cores_continue);
	/*NOTREACHED*/
}
7395
/*
 * Perf-controller entry point to request a new powered-core mask.
 * Records the request and wakes the worker thread; the request is
 * dropped while cluster powerdown is suspended.
 */
void
sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
{
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	/* Test-only flags: validate the asserted state and return. */
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(cluster_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(perfcontrol_sleep_override == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Ignore requests that arrive while powerdown is suspended. */
	bool should_wakeup = !cluster_powerdown_suspend_count;
	if (should_wakeup) {
		latest_requested_powered_cores = requested_powered_cores;
		latest_requested_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	if (should_wakeup) {
		thread_wakeup((event_t)sched_update_powered_cores_continue);
	}
}
7429
/*
 * Force all cores powered and stop applying perf-controller powerdown
 * requests. Nestable: only the first suspend performs the power-up.
 */
void
suspend_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	assert(cluster_powerdown_suspend_count >= 0);

	bool first_suspend = (cluster_powerdown_suspend_count == 0);
	if (first_suspend) {
		/* Reset request state so no stale request is applied later. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	cluster_powerdown_suspend_count++;

	if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7457
/*
 * Undo one suspend_cluster_powerdown(). When the last suspension is
 * released, reset the request state and unlock cluster power state.
 * Panics on unbalanced resume.
 */
void
resume_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (cluster_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
	}

	cluster_powerdown_suspend_count--;

	bool last_resume = (cluster_powerdown_suspend_count == 0);

	if (last_resume) {
		/* Reset request state so the worker starts from a clean slate. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7486
LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
/* Whether a user-initiated cluster-powerdown suspension is in effect. */
static bool user_suspended_cluster_powerdown = false;
7489
7490 kern_return_t
suspend_cluster_powerdown_from_user(void)7491 suspend_cluster_powerdown_from_user(void)
7492 {
7493 kern_return_t ret = KERN_FAILURE;
7494
7495 lck_mtx_lock(&user_cluster_powerdown_lock);
7496
7497 if (!user_suspended_cluster_powerdown) {
7498 suspend_cluster_powerdown();
7499 user_suspended_cluster_powerdown = true;
7500 ret = KERN_SUCCESS;
7501 }
7502
7503 lck_mtx_unlock(&user_cluster_powerdown_lock);
7504
7505 return ret;
7506 }
7507
7508 kern_return_t
resume_cluster_powerdown_from_user(void)7509 resume_cluster_powerdown_from_user(void)
7510 {
7511 kern_return_t ret = KERN_FAILURE;
7512
7513 lck_mtx_lock(&user_cluster_powerdown_lock);
7514
7515 if (user_suspended_cluster_powerdown) {
7516 resume_cluster_powerdown();
7517 user_suspended_cluster_powerdown = false;
7518 ret = KERN_SUCCESS;
7519 }
7520
7521 lck_mtx_unlock(&user_cluster_powerdown_lock);
7522
7523 return ret;
7524 }
7525
7526 int
get_cluster_powerdown_user_suspended(void)7527 get_cluster_powerdown_user_suspended(void)
7528 {
7529 lck_mtx_lock(&user_cluster_powerdown_lock);
7530
7531 int ret = (int)user_suspended_cluster_powerdown;
7532
7533 lck_mtx_unlock(&user_cluster_powerdown_lock);
7534
7535 return ret;
7536 }
7537
7538 #if DEVELOPMENT || DEBUG
7539 /* Functions to support the temporary sysctl */
7540 static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
/*
 * Sysctl backend: decode a packed control word and forward it as a
 * powered-cores request.
 */
void
sched_set_powered_cores(int requested_powered_cores)
{
	/* Bit 31 selects the requesting agent: set => CLPC user, clear => CLPC system. */
	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
	/* Bits 29:28 (mask 0x30000000) are passed through as flags. */
	uint32_t flags = requested_powered_cores & 0x30000000;

	saved_requested_powered_cores = requested_powered_cores;

	/* Bits 28:0 carry the actual powered-core bitmask. */
	requested_powered_cores = bits(requested_powered_cores, 28, 0);

	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
}
/* Sysctl backend: report the last raw value passed to sched_set_powered_cores(). */
int
sched_get_powered_cores(void)
{
	return (int)saved_requested_powered_cores;
}
7558 #endif
7559
7560 /*
7561 * Ensure that all cores are powered and recommended before sleep
7562 */
7563 void
sched_override_available_cores_for_sleep(void)7564 sched_override_available_cores_for_sleep(void)
7565 {
7566 spl_t s = splsched();
7567 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7568
7569 if (perfcontrol_sleep_override == false) {
7570 perfcontrol_sleep_override = true;
7571 #if __arm__ || __arm64__
7572 sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7573 #endif
7574 }
7575
7576 simple_unlock(&sched_available_cores_lock);
7577 splx(s);
7578
7579 suspend_cluster_powerdown();
7580 }
7581
7582 /*
7583 * Restore the previously recommended cores, but leave all cores powered
7584 * after sleep
7585 */
7586 void
sched_restore_available_cores_after_sleep(void)7587 sched_restore_available_cores_after_sleep(void)
7588 {
7589 spl_t s = splsched();
7590 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7591
7592 if (perfcontrol_sleep_override == true) {
7593 perfcontrol_sleep_override = false;
7594 #if __arm__ || __arm64__
7595 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
7596 REASON_NONE, 0);
7597 #endif
7598 }
7599
7600 simple_unlock(&sched_available_cores_lock);
7601 splx(s);
7602
7603 resume_cluster_powerdown();
7604 }
7605
#if __arm__ || __arm64__

/* Popcount of perfcontrol_requested_recommended_cores. */
uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
/* True while the recommended-cores failsafe is engaged (see below). */
bool perfcontrol_failsafe_active = false;

uint64_t perfcontrol_failsafe_maintenance_runnable_time;
uint64_t perfcontrol_failsafe_activation_time;
uint64_t perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int perfcontrol_failsafe_pid;
uint64_t perfcontrol_failsafe_tid;
uint64_t perfcontrol_failsafe_thread_timer_at_start;
uint64_t perfcontrol_failsafe_thread_timer_last_seen;
uint64_t perfcontrol_failsafe_recommended_at_trigger;
7623
7624 /*
7625 * Perf controller calls here to update the recommended core bitmask.
7626 * If the failsafe is active, we don't immediately apply the new value.
7627 * Instead, we store the new request and use it after the failsafe deactivates.
7628 *
7629 * If the failsafe is not active, immediately apply the update.
7630 *
7631 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7632 * interrupts are enabled
7633 *
7634 * currently prototype is in osfmk/arm/machine_routines.h
7635 */
void
sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the request on behalf of the appropriate agent. */
	if (reason == REASON_CLPC_SYSTEM) {
		perfcontrol_system_requested_recommended_cores = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		perfcontrol_user_requested_recommended_cores = recommended_cores;
	}

	/* The effective perf-controller request is the intersection of both. */
	perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
	perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);

	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
	} else {
		/* Failsafe or sleep override active: defer; just trace the request. */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
7666
/* Legacy entry point: treated as a REASON_CLPC_USER update with no flags. */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
7672
7673 /*
7674 * Consider whether we need to activate the recommended cores failsafe
7675 *
7676 * Called from quantum timer interrupt context of a realtime thread
7677 * No scheduler locks are held, interrupts are disabled
7678 */
7679 void
sched_consider_recommended_cores(uint64_t ctime,thread_t cur_thread)7680 sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
7681 {
7682 /*
7683 * Check if a realtime thread is starving the system
7684 * and bringing up non-recommended cores would help
7685 *
7686 * TODO: Is this the correct check for recommended == possible cores?
7687 * TODO: Validate the checks without the relevant lock are OK.
7688 */
7689
7690 if (__improbable(perfcontrol_failsafe_active == TRUE)) {
7691 /* keep track of how long the responsible thread runs */
7692 uint64_t cur_th_time = recount_current_thread_time_mach();
7693
7694 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7695
7696 if (perfcontrol_failsafe_active == TRUE &&
7697 cur_thread->thread_id == perfcontrol_failsafe_tid) {
7698 perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
7699 }
7700
7701 simple_unlock(&sched_available_cores_lock);
7702
7703 /* we're already trying to solve the problem, so bail */
7704 return;
7705 }
7706
7707 /* The failsafe won't help if there are no more processors to enable */
7708 if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
7709 return;
7710 }
7711
7712 uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
7713
7714 /* Use the maintenance thread as our canary in the coal mine */
7715 thread_t m_thread = sched_maintenance_thread;
7716
7717 /* If it doesn't look bad, nothing to see here */
7718 if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
7719 return;
7720 }
7721
7722 /* It looks bad, take the lock to be sure */
7723 thread_lock(m_thread);
7724
7725 if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
7726 (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
7727 m_thread->last_made_runnable_time >= too_long_ago) {
7728 /*
7729 * Maintenance thread is either on cpu or blocked, and
7730 * therefore wouldn't benefit from more cores
7731 */
7732 thread_unlock(m_thread);
7733 return;
7734 }
7735
7736 uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
7737
7738 thread_unlock(m_thread);
7739
7740 /*
7741 * There are cores disabled at perfcontrol's recommendation, but the
7742 * system is so overloaded that the maintenance thread can't run.
7743 * That likely means that perfcontrol can't run either, so it can't fix
7744 * the recommendation. We have to kick in a failsafe to keep from starving.
7745 *
7746 * When the maintenance thread has been starved for too long,
7747 * ignore the recommendation from perfcontrol and light up all the cores.
7748 *
7749 * TODO: Consider weird states like boot, sleep, or debugger
7750 */
7751
7752 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7753
7754 if (perfcontrol_failsafe_active == TRUE) {
7755 simple_unlock(&sched_available_cores_lock);
7756 return;
7757 }
7758
7759 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7760 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
7761 perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
7762
7763 perfcontrol_failsafe_active = TRUE;
7764 perfcontrol_failsafe_activation_time = mach_absolute_time();
7765 perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
7766 perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
7767
7768 /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
7769 task_t task = get_threadtask(cur_thread);
7770 perfcontrol_failsafe_pid = task_pid(task);
7771 strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));
7772
7773 perfcontrol_failsafe_tid = cur_thread->thread_id;
7774
7775 /* Blame the thread for time it has run recently */
7776 uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
7777
7778 uint64_t last_seen = recount_current_thread_time_mach();
7779
7780 /* Compute the start time of the bad behavior in terms of the thread's on core time */
7781 perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
7782 perfcontrol_failsafe_thread_timer_last_seen = last_seen;
7783
7784 /* Ignore the previously recommended core configuration */
7785 sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7786
7787 simple_unlock(&sched_available_cores_lock);
7788 }
7789
7790 /*
7791 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7792 *
7793 * Runs in the context of the maintenance thread, no locks held
7794 */
7795 static void
sched_recommended_cores_maintenance(void)7796 sched_recommended_cores_maintenance(void)
7797 {
7798 /* Common case - no failsafe, nothing to be done here */
7799 if (__probable(perfcontrol_failsafe_active == FALSE)) {
7800 return;
7801 }
7802
7803 uint64_t ctime = mach_absolute_time();
7804
7805 boolean_t print_diagnostic = FALSE;
7806 char p_name[FAILSAFE_NAME_LEN] = "";
7807
7808 spl_t s = splsched();
7809 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7810
7811 /* Check again, under the lock, to avoid races */
7812 if (perfcontrol_failsafe_active == FALSE) {
7813 goto out;
7814 }
7815
7816 /*
7817 * Ensure that the other cores get another few ticks to run some threads
7818 * If we don't have this hysteresis, the maintenance thread is the first
7819 * to run, and then it immediately kills the other cores
7820 */
7821 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7822 goto out;
7823 }
7824
7825 /* Capture some diagnostic state under the lock so we can print it out later */
7826
7827 int pid = perfcontrol_failsafe_pid;
7828 uint64_t tid = perfcontrol_failsafe_tid;
7829
7830 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
7831 perfcontrol_failsafe_thread_timer_at_start;
7832 uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
7833 uint64_t rec_cores_after = perfcontrol_requested_recommended_cores;
7834 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
7835 strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7836
7837 print_diagnostic = TRUE;
7838
7839 /* Deactivate the failsafe and reinstate the requested recommendation settings */
7840
7841 perfcontrol_failsafe_deactivation_time = ctime;
7842 perfcontrol_failsafe_active = FALSE;
7843
7844 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7845 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7846 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7847
7848 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
7849 REASON_NONE, 0);
7850
7851 out:
7852 simple_unlock(&sched_available_cores_lock);
7853 splx(s);
7854
7855 if (print_diagnostic) {
7856 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7857
7858 absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
7859 failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
7860
7861 absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
7862 thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
7863
7864 printf("recommended core failsafe kicked in for %lld ms "
7865 "likely due to %s[%d] thread 0x%llx spending "
7866 "%lld ms on cpu at realtime priority - "
7867 "new recommendation: 0x%llx -> 0x%llx\n",
7868 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7869 rec_cores_before, rec_cores_after);
7870 }
7871 }
7872
7873 #endif /* __arm64__ */
7874
/*
 * User-initiated enable/disable of an individual processor.
 *
 * Updates the user-requested recommendation mask and re-applies the
 * combined (perfcontrol & user) recommendation, unless the failsafe or
 * sleep override is active, in which case the change is recorded but
 * deferred (a tracepoint notes the skip).
 *
 * Returns KERN_NOT_SUPPORTED for the master processor, which must stay
 * enabled to keep the system running.
 */
kern_return_t
sched_processor_enable(processor_t processor, boolean_t enable)
{
	assert(preemption_enabled());

	if (processor == master_processor) {
		/* The system can hang if this is allowed */
		return KERN_NOT_SUPPORTED;
	}

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (enable) {
		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
	} else {
		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
	}

#if __arm64__
	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
	} else {
		/* Failsafe/override owns the core mask right now; just trace the request */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}
#else /* __arm64__ */
	/* No perfcontrol recommendations on this architecture; apply user mask directly */
	sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
#endif /* ! __arm64__ */

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return KERN_SUCCESS;
}
7913
/*
 * Mark a processor online in the scheduler's online-processor bitmap.
 *
 * The "_locked" suffix indicates the caller is expected to already hold
 * sched_available_cores_lock (cf. sched_mark_processor_offline below,
 * which takes it itself) — NOTE(review): not asserted here, confirm at
 * call sites.
 *
 * Only REASON_SYSTEM may bring the master processor online.
 */
void
sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
{
	assert((processor != master_processor) || (reason == REASON_SYSTEM));

	bit_set(sched_online_processors, processor->cpu_id);
}
7921
7922 kern_return_t
sched_mark_processor_offline(processor_t processor,processor_reason_t reason)7923 sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7924 {
7925 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7926 kern_return_t ret = KERN_SUCCESS;
7927
7928 spl_t s = splsched();
7929 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7930
7931 if (reason == REASON_SYSTEM) {
7932 bit_clear(sched_online_processors, processor->cpu_id);
7933 simple_unlock(&sched_available_cores_lock);
7934 splx(s);
7935 return ret;
7936 }
7937
7938 uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7939
7940 if (!bit_test(sched_online_processors, processor->cpu_id)) {
7941 /* Processor is already offline */
7942 ret = KERN_NOT_IN_SET;
7943 } else if (available_cores == BIT(processor->cpu_id)) {
7944 ret = KERN_RESOURCE_SHORTAGE;
7945 } else {
7946 bit_clear(sched_online_processors, processor->cpu_id);
7947 ret = KERN_SUCCESS;
7948 }
7949
7950 simple_unlock(&sched_available_cores_lock);
7951 splx(s);
7952
7953 return ret;
7954 }
7955
7956 /*
7957 * Apply a new recommended cores mask to the processors it affects
7958 * Runs after considering failsafes and such
7959 *
7960 * Iterate over processors and update their ->is_recommended field.
7961 * If a processor is running, we let it drain out at its next
7962 * quantum expiration or blocking point. If a processor is idle, there
7963 * may be more work for it to do, so IPI it.
7964 *
7965 * interrupts disabled, sched_available_cores_lock is held
7966 */
7967 static void
sched_update_recommended_cores(uint64_t recommended_cores,processor_reason_t reason,__unused uint32_t flags)7968 sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
7969 {
7970 uint64_t needs_exit_idle_mask = 0x0;
7971
7972 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
7973 recommended_cores,
7974 #if __arm64__
7975 perfcontrol_failsafe_active, 0, 0);
7976 #else /* __arm64__ */
7977 0, 0, 0);
7978 #endif /* ! __arm64__ */
7979
7980 if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
7981 bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
7982 }
7983
7984 /* First set recommended cores */
7985 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
7986 for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
7987 processor_set_t pset = pset_array[pset_id];
7988
7989 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
7990 cpumap_t newly_recommended = changed_recommendations & recommended_cores;
7991
7992 if (newly_recommended == 0) {
7993 /* Nothing to do */
7994 continue;
7995 }
7996
7997 pset_lock(pset);
7998
7999 for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
8000 processor_t processor = processor_array[cpu_id];
8001 processor->is_recommended = TRUE;
8002 processor->last_recommend_reason = reason;
8003 bit_set(pset->recommended_bitmask, processor->cpu_id);
8004
8005 if (processor->state == PROCESSOR_IDLE) {
8006 if (processor != current_processor()) {
8007 bit_set(needs_exit_idle_mask, processor->cpu_id);
8008 }
8009 }
8010 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8011 os_atomic_inc(&processor_avail_count_user, relaxed);
8012 if (processor->processor_primary == processor) {
8013 os_atomic_inc(&primary_processor_avail_count_user, relaxed);
8014 }
8015 SCHED(pset_made_schedulable)(processor, pset, false);
8016 }
8017 }
8018 pset_update_rt_stealable_state(pset);
8019
8020 pset_unlock(pset);
8021
8022 for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
8023 cpu_id = lsb_next(newly_recommended, cpu_id)) {
8024 smr_cpu_up(processor_array[cpu_id],
8025 SMR_CPU_REASON_IGNORED);
8026 }
8027 }
8028 }
8029
8030 /* Now shutdown not recommended cores */
8031 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
8032 for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8033 processor_set_t pset = pset_array[pset_id];
8034
8035 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
8036 cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;
8037
8038 if (newly_unrecommended == 0) {
8039 /* Nothing to do */
8040 continue;
8041 }
8042
8043 pset_lock(pset);
8044
8045 for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
8046 processor_t processor = processor_array[cpu_id];
8047 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
8048
8049 processor->is_recommended = FALSE;
8050 if (reason != REASON_NONE) {
8051 processor->last_derecommend_reason = reason;
8052 }
8053 bit_clear(pset->recommended_bitmask, processor->cpu_id);
8054 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8055 os_atomic_dec(&processor_avail_count_user, relaxed);
8056 if (processor->processor_primary == processor) {
8057 os_atomic_dec(&primary_processor_avail_count_user, relaxed);
8058 }
8059 }
8060 pset_update_rt_stealable_state(pset);
8061
8062 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
8063 ipi_type = SCHED_IPI_IMMEDIATE;
8064 }
8065 SCHED(processor_queue_shutdown)(processor);
8066 /* pset unlocked */
8067
8068 SCHED(rt_queue_shutdown)(processor);
8069
8070 if (ipi_type == SCHED_IPI_NONE) {
8071 /*
8072 * If the core is idle,
8073 * we can directly mark the processor
8074 * as "Ignored"
8075 *
8076 * Otherwise, smr will detect this
8077 * during smr_cpu_leave() when the
8078 * processor actually idles.
8079 */
8080 smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
8081 } else if (processor == current_processor()) {
8082 ast_on(AST_PREEMPT);
8083 } else {
8084 sched_ipi_perform(processor, ipi_type);
8085 }
8086
8087 pset_lock(pset);
8088 }
8089 pset_unlock(pset);
8090 }
8091 }
8092
8093 #if defined(__x86_64__)
8094 commpage_update_active_cpus();
8095 #endif
8096 /* Issue all pending IPIs now that the pset lock has been dropped */
8097 for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
8098 processor_t processor = processor_array[cpuid];
8099 machine_signal_idle(processor);
8100 }
8101
8102 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
8103 needs_exit_idle_mask, 0, 0, 0);
8104 }
8105
/*
 * Apply a requested powered-cores mask: start newly requested cores
 * first (optionally waiting for the last start), then shut down cores
 * that are no longer requested.
 *
 * Called with no scheduler locks held; pset locks are taken briefly only
 * to snapshot the current power state, so the state may move under us —
 * processor_start_reason()/processor_exit_reason() are expected to cope.
 */
static void
sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
{
	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested_powered_cores, reason, flags, 0);

	/* LOCK_STATE/UNLOCK_STATE is only valid for system-wide all-cores requests */
	assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);

	/*
	 * Loop through newly set requested_powered_cores and start them.
	 * Loop through newly cleared requested_powered_cores and shut them down.
	 */

	if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
		/* CLPC-driven power changes are transient, not full offlining */
		flags |= SHUTDOWN_TEMPORARY;
	}

	/* First set powered cores */
	cpumap_t started_cores = 0ull;
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Snapshot which cores are currently powered, under the pset lock */
			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_powered = requested_changes & requested_powered_cores;

			cpumap_t cpu_map = newly_powered;

			if (flags & (LOCK_STATE | UNLOCK_STATE)) {
				/*
				 * We need to change the lock state even if
				 * we don't need to change the actual state.
				 */
				cpu_map = pset_requested_powered_cores;
				/* But not the master_processor, which is always implicitly locked */
				bit_clear(cpu_map, master_processor->cpu_id);
			}

			if (cpu_map == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor_start_reason(processor, reason, flags);
				bit_set(started_cores, cpu_id);
			}
		}
	}
	if (flags & WAIT_FOR_LAST_START) {
		/* Synchronously wait for every core we kicked off above */
		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
			processor_t processor = processor_array[cpu_id];
			processor_wait_for_start(processor);
		}
	}

	/* Now shutdown not powered cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;

			if (newly_unpowered == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
				processor_t processor = processor_array[cpu_id];

				processor_exit_reason(processor, reason, flags);
			}
		}
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
}
8199
8200 void
thread_set_options(uint32_t thopt)8201 thread_set_options(uint32_t thopt)
8202 {
8203 spl_t x;
8204 thread_t t = current_thread();
8205
8206 x = splsched();
8207 thread_lock(t);
8208
8209 t->options |= thopt;
8210
8211 thread_unlock(t);
8212 splx(x);
8213 }
8214
/*
 * Record a hint describing why the thread is about to block.
 *
 * Plain store with no locking — assumes the caller context makes the
 * unsynchronized write safe (NOTE(review): presumably set on the thread
 * itself just before it blocks; confirm at call sites).
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
8220
/*
 * Return the recommended maximum parallelism for the given QoS class,
 * delegating to the active scheduler policy's implementation.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
8226
8227 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)8228 sched_qos_max_parallelism(__unused int qos, uint64_t options)
8229 {
8230 host_basic_info_data_t hinfo;
8231 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8232
8233
8234 /*
8235 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
8236 * implement their own qos_max_parallelism() interfaces.
8237 */
8238 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8239
8240 /* Query the machine layer for core information */
8241 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
8242 (host_info_t)&hinfo, &count);
8243 assert(kret == KERN_SUCCESS);
8244
8245 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8246 return hinfo.logical_cpu;
8247 } else {
8248 return hinfo.physical_cpu;
8249 }
8250 }
8251
/* Master switch for honoring NO_SMT thread/task flags */
int sched_allow_NO_SMT_threads = 1;

/*
 * Should this thread avoid sharing a physical core with an SMT sibling?
 *
 * True when NO_SMT handling is enabled, the thread is not bound to a
 * specific processor, and either the thread (TH_SFLAG_NO_SMT) or its
 * task (TF_NO_SMT) requests SMT avoidance.
 */
bool
thread_no_smt(thread_t thread)
{
	return sched_allow_NO_SMT_threads &&
	       (thread->bound_processor == PROCESSOR_NULL) &&
	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
}
8260
/*
 * Does the thread currently on this processor request SMT avoidance?
 *
 * Uses the processor's cached current_is_bound/current_is_NO_SMT state
 * rather than dereferencing the active thread.
 */
bool
processor_active_thread_no_smt(processor_t processor)
{
	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
}
8266
8267 #if __arm64__
8268
8269 /*
8270 * Set up or replace old timer with new timer
8271 *
8272 * Returns true if canceled old timer, false if it did not
8273 */
8274 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)8275 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8276 {
8277 /*
8278 * Exchange deadline for new deadline, if old deadline was nonzero,
8279 * then I cancelled the callback, otherwise I didn't
8280 */
8281
8282 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8283 relaxed) != 0;
8284 }
8285
8286 /*
8287 * Set global SFI window (in usec)
8288 */
8289 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)8290 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8291 {
8292 kern_return_t ret = KERN_NOT_SUPPORTED;
8293 #if CONFIG_THREAD_GROUPS
8294 if (window_usecs == 0ULL) {
8295 ret = sfi_window_cancel();
8296 } else {
8297 ret = sfi_set_window(window_usecs);
8298 }
8299 #endif // CONFIG_THREAD_GROUPS
8300 return ret;
8301 }
8302
8303 /*
8304 * Set background and maintenance SFI class offtimes
8305 */
8306 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)8307 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8308 {
8309 kern_return_t ret = KERN_NOT_SUPPORTED;
8310 #if CONFIG_THREAD_GROUPS
8311 if (offtime_usecs == 0ULL) {
8312 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8313 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8314 } else {
8315 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8316 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8317 }
8318 #endif // CONFIG_THREAD_GROUPS
8319 return ret;
8320 }
8321
8322 /*
8323 * Set utility SFI class offtime
8324 */
8325 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)8326 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8327 {
8328 kern_return_t ret = KERN_NOT_SUPPORTED;
8329 #if CONFIG_THREAD_GROUPS
8330 if (offtime_usecs == 0ULL) {
8331 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8332 } else {
8333 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8334 }
8335 #endif // CONFIG_THREAD_GROUPS
8336 return ret;
8337 }
8338
8339 #endif /* __arm64__ */
8340
8341 #if CONFIG_SCHED_EDGE
8342
8343 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8344
8345 /*
8346 * sched_edge_pset_running_higher_bucket()
8347 *
8348 * Routine to calculate cumulative running counts for each scheduling
8349 * bucket. This effectively lets the load calculation calculate if a
8350 * cluster is running any threads at a QoS lower than the thread being
8351 * migrated etc.
8352 */
8353
8354 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)8355 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
8356 {
8357 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
8358
8359 /* Edge Scheduler Optimization */
8360 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8361 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
8362 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8363 running_higher[bucket]++;
8364 }
8365 }
8366 }
8367
8368 /*
8369 * sched_update_pset_load_average()
8370 *
8371 * Updates the load average for each sched bucket for a cluster.
8372 * This routine must be called with the pset lock held.
8373 */
8374 void
sched_update_pset_load_average(processor_set_t pset,uint64_t curtime)8375 sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
8376 {
8377 int avail_cpu_count = pset_available_cpu_count(pset);
8378 if (avail_cpu_count == 0) {
8379 /* Looks like the pset is not runnable any more; nothing to do here */
8380 return;
8381 }
8382
8383 /*
8384 * Edge Scheduler Optimization
8385 *
8386 * See if more callers of this routine can pass in timestamps to avoid the
8387 * mach_absolute_time() call here.
8388 */
8389
8390 if (!curtime) {
8391 curtime = mach_absolute_time();
8392 }
8393 uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
8394 int64_t delta_ticks = curtime - last_update;
8395 if (delta_ticks < 0) {
8396 return;
8397 }
8398
8399 uint64_t delta_nsecs = 0;
8400 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8401
8402 if (__improbable(delta_nsecs > UINT32_MAX)) {
8403 delta_nsecs = UINT32_MAX;
8404 }
8405
8406 #if CONFIG_SCHED_EDGE
8407 /* Update the shared resource load on the pset */
8408 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
8409 uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
8410 uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
8411 uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
8412 uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
8413 if (old_shared_load != new_shared_load) {
8414 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
8415 }
8416 }
8417 #endif /* CONFIG_SCHED_EDGE */
8418
8419 uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
8420 sched_edge_pset_running_higher_bucket(pset, running_higher);
8421
8422 for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
8423 uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
8424 uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
8425 uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;
8426
8427 /*
8428 * For the new load average multiply current_runq_depth by delta_nsecs (which resuts in a 32.0 value).
8429 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
8430 * new load averga needs to be shifted before it can be added to the old load average.
8431 */
8432 uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
8433
8434 /*
8435 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
8436 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
8437 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
8438 */
8439 int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8440 boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
8441 boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
8442 uint64_t load_average;
8443 if (load_uptick || load_downtick) {
8444 load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8445 } else {
8446 /* Indicates a loaded system; use EWMA for load average calculation */
8447 load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8448 }
8449 os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
8450 if (load_average != old_load_average) {
8451 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
8452 }
8453 }
8454 os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
8455 }
8456
8457 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)8458 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8459 {
8460 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8461 uint64_t avg_thread_execution_time = 0;
8462
8463 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8464 old_execution_time_packed.pset_execution_time_packed,
8465 new_execution_time_packed.pset_execution_time_packed, relaxed, {
8466 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8467 int64_t delta_ticks = curtime - last_update;
8468 if (delta_ticks < 0) {
8469 /*
8470 * Its possible that another CPU came in and updated the pset_execution_time
8471 * before this CPU could do it. Since the average execution time is meant to
8472 * be an approximate measure per cluster, ignore the older update.
8473 */
8474 os_atomic_rmw_loop_give_up(return );
8475 }
8476 uint64_t delta_nsecs = 0;
8477 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8478
8479 uint64_t nanotime = 0;
8480 absolutetime_to_nanoseconds(execution_time, &nanotime);
8481 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8482
8483 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8484 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8485
8486 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8487 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8488 new_execution_time_packed.pset_execution_time_last_update = curtime;
8489 });
8490 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
8491 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8492 }
8493 }
8494
/*
 * Read the pset's current shared-resource load for the given resource
 * type, as maintained by sched_update_pset_load_average().
 */
uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
}
8500
8501 #else /* CONFIG_SCHED_EDGE */
8502
/*
 * Non-Edge variant: simple decaying load average for the pset.
 *
 * Instantaneous load = running CPUs + non-RT runnable + RT runnable,
 * scaled by PSET_LOAD_NUMERATOR_SHIFT; the new average is the mean of
 * the old average and the instantaneous load (EWMA with alpha = 1/2).
 * Expected to run with the pset lock held (see the Edge variant's
 * contract) — NOTE(review): not asserted here.
 */
void
sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
{
	int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
	int new_load_average = ((int)pset->load_average + load) >> 1;

	pset->load_average = new_load_average;
#if (DEVELOPMENT || DEBUG)
#if __AMP__
	/* Trace P-cluster load on development/debug AMP kernels */
	if (pset->pset_cluster_type == PSET_AMP_P) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
	}
#endif
#endif
}
8519
/*
 * Per-pset average thread execution time is only tracked by the Edge
 * scheduler (see the CONFIG_SCHED_EDGE implementation above); this is a
 * no-op stub for all other scheduler configurations.
 */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
8524
8525 #endif /* CONFIG_SCHED_EDGE */
8526
8527 /* pset is locked */
8528 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)8529 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8530 {
8531 int cpuid = processor->cpu_id;
8532 #if defined(__x86_64__)
8533 if (sched_avoid_cpu0 && (cpuid == 0)) {
8534 return false;
8535 }
8536 #endif
8537
8538 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8539
8540 return bit_test(fasttrack_map, cpuid);
8541 }
8542
/* pset is locked */
/*
 * Pick a processor in this pset for a realtime thread.
 *
 * Preference order:
 *   1. An eligible primary CPU (available, no urgent AST pending, not
 *      already running realtime work).
 *   2. A secondary (SMT) CPU whose primary is running a realtime thread.
 *   3. Any other eligible secondary CPU.
 * If no CPU qualifies but there are still more non-realtime CPUs than
 * queued realtime threads, returns some processor in this pset anyway so
 * the thread is enqueued here; otherwise returns PROCESSOR_NULL.
 *
 * skip_processor, if non-NULL, is never returned (and the fallback
 * selection is skipped entirely in that case).
 */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/* Candidates: available, no urgent AST pending, not running realtime */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Policy 2 excludes cpu0 outright (until the last-resort retry below) */
		bit_clear(cpu_map, 0);
	}

	/* First choice: a primary CPU */
	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate right by 1 so cpu0 is considered last by lsb_first() */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1: rotate by 2 so cpus 0 and 1 are considered last */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * I was hoping the compiler would optimize
	 * this away when avoid_cpu0 is const bool false
	 * but it still complains about the assignment
	 * in that case.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		/* Last resort for policy 2: permit cpu0/cpu1 after all and retry once */
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback, but now counting secondaries as well */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
8671
8672 /*
8673 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
8674 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
8675 *
8676 * pset is locked.
8677 */
8678 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)8679 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
8680 {
8681 uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
8682 processor_t fd_processor = PROCESSOR_NULL;
8683 int lowest_priority = max_pri;
8684
8685 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
8686 if (skip_processor) {
8687 bit_clear(cpu_map, skip_processor->cpu_id);
8688 }
8689 if (skip_spills) {
8690 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8691 }
8692
8693 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8694 processor_t processor = processor_array[cpuid];
8695
8696 if (processor->current_pri > lowest_priority) {
8697 continue;
8698 }
8699
8700 if (processor->current_pri < lowest_priority) {
8701 lowest_priority = processor->current_pri;
8702 furthest_deadline = processor->deadline;
8703 fd_processor = processor;
8704 continue;
8705 }
8706
8707 if (processor->deadline > furthest_deadline) {
8708 furthest_deadline = processor->deadline;
8709 fd_processor = processor;
8710 }
8711 }
8712
8713 if (fd_processor) {
8714 return fd_processor;
8715 }
8716
8717 /*
8718 * There is a race condition possible when there are multiple processor sets.
8719 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
8720 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
8721 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
8722 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
8723 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
8724 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
8725 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
8726 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
8727 *
8728 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
8729 * on the run queue of that pset.
8730 */
8731 if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
8732 cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
8733 assert(skip_processor == PROCESSOR_NULL);
8734 assert(skip_spills == false);
8735
8736 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8737 processor_t processor = processor_array[cpuid];
8738
8739 if (processor->current_pri > lowest_priority) {
8740 continue;
8741 }
8742
8743 if (processor->current_pri < lowest_priority) {
8744 lowest_priority = processor->current_pri;
8745 furthest_deadline = processor->deadline;
8746 fd_processor = processor;
8747 continue;
8748 }
8749
8750 if (processor->deadline > furthest_deadline) {
8751 furthest_deadline = processor->deadline;
8752 fd_processor = processor;
8753 }
8754 }
8755 }
8756
8757 return fd_processor;
8758 }
8759
8760 /* pset is locked */
8761 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)8762 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
8763 {
8764 bool skip_spills = true;
8765 bool include_ast_urgent_pending_cpus = false;
8766
8767 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
8768 if (next_processor != PROCESSOR_NULL) {
8769 return next_processor;
8770 }
8771
8772 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
8773 return next_processor;
8774 }
8775
8776 #if defined(__x86_64__)
8777 /* pset is locked */
8778 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)8779 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
8780 {
8781 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8782 int nbackup_cpus = 0;
8783
8784 if (include_backups && rt_runq_is_low_latency(pset)) {
8785 nbackup_cpus = sched_rt_n_backup_processors;
8786 }
8787
8788 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8789 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8790 bit_clear(cpu_map, 0);
8791 }
8792 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8793 }
8794
8795 /* pset is locked */
8796 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)8797 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
8798 {
8799 int nbackup_cpus = 0;
8800
8801 if (include_backups && rt_runq_is_low_latency(pset)) {
8802 nbackup_cpus = sched_rt_n_backup_processors;
8803 }
8804
8805 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
8806 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8807 }
8808 #endif
8809
8810 static bool
sched_ok_to_run_realtime_thread(processor_set_t pset,processor_t processor,bool as_backup)8811 sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
8812 {
8813 if (!processor->is_recommended) {
8814 return false;
8815 }
8816 bool ok_to_run_realtime_thread = true;
8817 #if defined(__x86_64__)
8818 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
8819 if (spill_pending) {
8820 return true;
8821 }
8822 if (processor->cpu_id == 0) {
8823 if (sched_avoid_cpu0 == 1) {
8824 ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
8825 } else if (sched_avoid_cpu0 == 2) {
8826 ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
8827 }
8828 } else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
8829 ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
8830 } else if (processor->processor_primary != processor) {
8831 ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
8832 }
8833 #else
8834 (void)pset;
8835 (void)processor;
8836 (void)as_backup;
8837 #endif
8838 return ok_to_run_realtime_thread;
8839 }
8840
/*
 * Hook invoked when a pset is made schedulable. No scheduler-specific work
 * is needed in this configuration; just honor the caller's request to drop
 * the pset lock it holds.
 */
void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}
8848
8849 void
thread_set_no_smt(bool set)8850 thread_set_no_smt(bool set)
8851 {
8852 if (!system_is_SMT) {
8853 /* Not a machine that supports SMT */
8854 return;
8855 }
8856
8857 thread_t thread = current_thread();
8858
8859 spl_t s = splsched();
8860 thread_lock(thread);
8861 if (set) {
8862 thread->sched_flags |= TH_SFLAG_NO_SMT;
8863 }
8864 thread_unlock(thread);
8865 splx(s);
8866 }
8867
8868 bool
thread_get_no_smt(void)8869 thread_get_no_smt(void)
8870 {
8871 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8872 }
8873
8874 extern void task_set_no_smt(task_t);
8875 void
task_set_no_smt(task_t task)8876 task_set_no_smt(task_t task)
8877 {
8878 if (!system_is_SMT) {
8879 /* Not a machine that supports SMT */
8880 return;
8881 }
8882
8883 if (task == TASK_NULL) {
8884 task = current_task();
8885 }
8886
8887 task_lock(task);
8888 task->t_flags |= TF_NO_SMT;
8889 task_unlock(task);
8890 }
8891
8892 #if DEBUG || DEVELOPMENT
8893 extern void sysctl_task_set_no_smt(char no_smt);
8894 void
sysctl_task_set_no_smt(char no_smt)8895 sysctl_task_set_no_smt(char no_smt)
8896 {
8897 if (!system_is_SMT) {
8898 /* Not a machine that supports SMT */
8899 return;
8900 }
8901
8902 task_t task = current_task();
8903
8904 task_lock(task);
8905 if (no_smt == '1') {
8906 task->t_flags |= TF_NO_SMT;
8907 }
8908 task_unlock(task);
8909 }
8910
8911 extern char sysctl_task_get_no_smt(void);
8912 char
sysctl_task_get_no_smt(void)8913 sysctl_task_get_no_smt(void)
8914 {
8915 task_t task = current_task();
8916
8917 if (task->t_flags & TF_NO_SMT) {
8918 return '1';
8919 }
8920 return '0';
8921 }
8922 #endif /* DEVELOPMENT || DEBUG */
8923
8924
8925 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8926 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8927 {
8928 #if __AMP__
8929 spl_t s = splsched();
8930 thread_lock(thread);
8931 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8932 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8933 if (soft_bound) {
8934 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8935 }
8936 switch (cluster_type) {
8937 case 'e':
8938 case 'E':
8939 if (pset0.pset_cluster_type == PSET_AMP_E) {
8940 thread->th_bound_cluster_id = pset0.pset_id;
8941 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8942 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8943 }
8944 break;
8945 case 'p':
8946 case 'P':
8947 if (pset0.pset_cluster_type == PSET_AMP_P) {
8948 thread->th_bound_cluster_id = pset0.pset_id;
8949 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8950 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8951 }
8952 break;
8953 default:
8954 break;
8955 }
8956 thread_unlock(thread);
8957 splx(s);
8958
8959 if (thread == current_thread()) {
8960 thread_block(THREAD_CONTINUE_NULL);
8961 }
8962 #else /* __AMP__ */
8963 (void)thread;
8964 (void)cluster_type;
8965 (void)soft_bound;
8966 #endif /* __AMP__ */
8967 }
8968
extern uint32_t thread_bound_cluster_id(thread_t thread);
/*
 * Return the cluster id the thread is bound to, or
 * THREAD_BOUND_CLUSTER_NONE if it is unbound.
 */
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}
8975
8976 __private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)8977 thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
8978 {
8979 #if __AMP__
8980
8981 processor_set_t pset = NULL;
8982
8983 /* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
8984 if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) {
8985 /* If the thread was actually not bound to some cluster, nothing to do here */
8986 if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
8987 return KERN_SUCCESS;
8988 }
8989 } else {
8990 /* Validate the inputs for the bind case */
8991 int max_clusters = ml_get_cluster_count();
8992 if (cluster_id >= max_clusters) {
8993 /* Invalid cluster id */
8994 return KERN_INVALID_VALUE;
8995 }
8996 pset = pset_array[cluster_id];
8997 if (pset == NULL) {
8998 /* Cluster has not been initialized yet */
8999 return KERN_INVALID_VALUE;
9000 }
9001 if (options & THREAD_BIND_ELIGIBLE_ONLY) {
9002 if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
9003 /* Thread is not recommended for the cluster type */
9004 return KERN_INVALID_POLICY;
9005 }
9006 }
9007 }
9008
9009 spl_t s = splsched();
9010 thread_lock(thread);
9011
9012 /* Unbind the thread from its previous bound state */
9013 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
9014 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
9015
9016 if (options & THREAD_UNBIND) {
9017 /* Nothing more to do here */
9018 goto thread_bind_cluster_complete;
9019 }
9020
9021 if (options & THREAD_BIND_SOFT) {
9022 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
9023 }
9024 thread->th_bound_cluster_id = cluster_id;
9025
9026 thread_bind_cluster_complete:
9027 thread_unlock(thread);
9028 splx(s);
9029
9030 if (thread == current_thread()) {
9031 thread_block(THREAD_CONTINUE_NULL);
9032 }
9033 #else /* __AMP__ */
9034 (void)thread;
9035 (void)cluster_id;
9036 (void)options;
9037 #endif /* __AMP__ */
9038 return KERN_SUCCESS;
9039 }
9040
9041 #if DEVELOPMENT || DEBUG
9042 extern int32_t sysctl_get_bound_cpuid(void);
9043 int32_t
sysctl_get_bound_cpuid(void)9044 sysctl_get_bound_cpuid(void)
9045 {
9046 int32_t cpuid = -1;
9047 thread_t self = current_thread();
9048
9049 processor_t processor = self->bound_processor;
9050 if (processor == NULL) {
9051 cpuid = -1;
9052 } else {
9053 cpuid = processor->cpu_id;
9054 }
9055
9056 return cpuid;
9057 }
9058
9059 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
9060 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)9061 sysctl_thread_bind_cpuid(int32_t cpuid)
9062 {
9063 processor_t processor = PROCESSOR_NULL;
9064
9065 if (cpuid == -1) {
9066 goto unbind;
9067 }
9068
9069 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
9070 return KERN_INVALID_VALUE;
9071 }
9072
9073 processor = processor_array[cpuid];
9074 if (processor == PROCESSOR_NULL) {
9075 return KERN_INVALID_VALUE;
9076 }
9077
9078 #if __AMP__
9079
9080 thread_t thread = current_thread();
9081
9082 if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
9083 if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
9084 /* Cannot hard-bind an already hard-cluster-bound thread */
9085 return KERN_NOT_SUPPORTED;
9086 }
9087 }
9088
9089 #endif /* __AMP__ */
9090
9091 unbind:
9092 thread_bind(processor);
9093
9094 thread_block(THREAD_CONTINUE_NULL);
9095 return KERN_SUCCESS;
9096 }
9097
9098 extern char sysctl_get_task_cluster_type(void);
9099 char
sysctl_get_task_cluster_type(void)9100 sysctl_get_task_cluster_type(void)
9101 {
9102 task_t task = current_task();
9103 processor_set_t pset_hint = task->pset_hint;
9104
9105 if (!pset_hint) {
9106 return '0';
9107 }
9108
9109 #if __AMP__
9110 if (pset_hint->pset_cluster_type == PSET_AMP_E) {
9111 return 'E';
9112 } else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
9113 return 'P';
9114 }
9115 #endif
9116
9117 return '0';
9118 }
9119
9120 #if __AMP__
9121 static processor_set_t
find_pset_of_type(pset_cluster_type_t t)9122 find_pset_of_type(pset_cluster_type_t t)
9123 {
9124 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
9125 if (node->pset_cluster_type != t) {
9126 continue;
9127 }
9128
9129 processor_set_t pset = PROCESSOR_SET_NULL;
9130 for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
9131 pset = pset_array[pset_id];
9132 /* Prefer one with recommended processsors */
9133 if (pset->recommended_bitmask != 0) {
9134 assert(pset->pset_cluster_type == t);
9135 return pset;
9136 }
9137 }
9138 /* Otherwise return whatever was found last */
9139 return pset;
9140 }
9141
9142 return PROCESSOR_SET_NULL;
9143 }
9144 #endif
9145
9146 extern void sysctl_task_set_cluster_type(char cluster_type);
9147 void
sysctl_task_set_cluster_type(char cluster_type)9148 sysctl_task_set_cluster_type(char cluster_type)
9149 {
9150 task_t task = current_task();
9151 processor_set_t pset_hint = PROCESSOR_SET_NULL;
9152
9153 #if __AMP__
9154 switch (cluster_type) {
9155 case 'e':
9156 case 'E':
9157 pset_hint = find_pset_of_type(PSET_AMP_E);
9158 break;
9159 case 'p':
9160 case 'P':
9161 pset_hint = find_pset_of_type(PSET_AMP_P);
9162 break;
9163 default:
9164 break;
9165 }
9166
9167 if (pset_hint) {
9168 task_lock(task);
9169 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
9170 task->pset_hint = pset_hint;
9171 task_unlock(task);
9172
9173 thread_block(THREAD_CONTINUE_NULL);
9174 }
9175 #else
9176 (void)cluster_type;
9177 (void)task;
9178 (void)pset_hint;
9179 #endif
9180 }
9181
9182 /*
9183 * The quantum length used for Fixed and RT sched modes. In general the quantum
9184 * can vary - for example for background or QOS.
9185 */
9186 extern uint64_t sysctl_get_quantum_us(void);
9187 uint64_t
sysctl_get_quantum_us(void)9188 sysctl_get_quantum_us(void)
9189 {
9190 uint32_t quantum;
9191 uint64_t quantum_ns;
9192
9193 quantum = SCHED(initial_quantum_size)(THREAD_NULL);
9194 absolutetime_to_nanoseconds(quantum, &quantum_ns);
9195
9196 return quantum_ns / 1000;
9197 }
9198
9199 #endif /* DEVELOPMENT || DEBUG */
9200