1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #if MONOTONIC
94 #include <kern/monotonic.h>
95 #endif /* MONOTONIC */
96 #include <kern/processor.h>
97 #include <kern/queue.h>
98 #include <kern/recount.h>
99 #include <kern/restartable.h>
100 #include <kern/sched.h>
101 #include <kern/sched_prim.h>
102 #include <kern/sfi.h>
103 #include <kern/syscall_subr.h>
104 #include <kern/task.h>
105 #include <kern/thread.h>
106 #include <kern/thread_group.h>
107 #include <kern/ledger.h>
108 #include <kern/timer_queue.h>
109 #include <kern/waitq.h>
110 #include <kern/policy_internal.h>
111 #include <kern/cpu_quiesce.h>
112
113 #include <vm/pmap.h>
114 #include <vm/vm_kern.h>
115 #include <vm/vm_map.h>
116 #include <vm/vm_pageout.h>
117
118 #include <mach/sdt.h>
119 #include <mach/mach_host.h>
120 #include <mach/host_info.h>
121
122 #include <sys/kdebug.h>
123 #include <kperf/kperf.h>
124 #include <kern/kpc.h>
125 #include <san/kasan.h>
126 #include <kern/pms.h>
127 #include <kern/host.h>
128 #include <stdatomic.h>
129 #include <os/atomic_private.h>
130
131 struct sched_statistics PERCPU_DATA(sched_stats);
132 bool sched_stats_active;
133
134 static uint64_t
deadline_add(uint64_t d,uint64_t e)135 deadline_add(uint64_t d, uint64_t e)
136 {
137 uint64_t sum;
138 return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
139 }
140
141 int
rt_runq_count(processor_set_t pset)142 rt_runq_count(processor_set_t pset)
143 {
144 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
145 }
146
147 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)148 rt_runq_earliest_deadline(processor_set_t pset)
149 {
150 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
151 }
152
153 static int
rt_runq_priority(processor_set_t pset)154 rt_runq_priority(processor_set_t pset)
155 {
156 pset_assert_locked(pset);
157 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
158
159 bitmap_t *map = rt_run_queue->bitmap;
160 int i = bitmap_first(map, NRTQS);
161 assert(i < NRTQS);
162
163 if (i >= 0) {
164 return i + BASEPRI_RTQUEUES;
165 }
166
167 return i;
168 }
169
170 static thread_t rt_runq_first(rt_queue_t rt_runq);
171
172 #if DEBUG
/*
 * Debug-only invariant checker for a realtime run queue.
 *
 * Walks every per-priority FIFO and verifies, per queue:
 *   - each thread's sched_pri matches the queue it sits on;
 *   - deadlines/constraints are real values (below the NONE sentinels);
 *   - each FIFO is sorted by non-decreasing deadline;
 *   - the cached pri_earliest_deadline/pri_constraint match the head;
 *   - pri_count matches the walked length, and the occupancy bitmap agrees.
 * Then verifies the queue-wide cached aggregates (earliest_deadline, count,
 * constraint, ed_index) against the recomputed values.
 *
 * If 'thread' is non-NULL, additionally asserts that it was found somewhere
 * on the queue.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	/* Recomputed aggregates, to compare against the cached ones at the end. */
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* FIFO must be ordered earliest-deadline-first. */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* Cached per-priority values must mirror the queue head. */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* Empty queue: bitmap clear and sentinels restored. */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* Queue-wide cached aggregates must match the recomputation. */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
231 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
232 #else
233 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
234 #endif
235
236 uint32_t rt_constraint_threshold;
237
238 static bool
rt_runq_is_low_latency(processor_set_t pset)239 rt_runq_is_low_latency(processor_set_t pset)
240 {
241 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
242 }
243
244 #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
245 TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
246
247 #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
248 TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
249
250 #define MAX_UNSAFE_RT_QUANTA 100
251
252 #define MAX_UNSAFE_FIXED_QUANTA 100
253
254 #if DEVELOPMENT || DEBUG
255 TUNABLE_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
256 TUNABLE_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
257 #else
258 TUNABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
259 TUNABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
260 #endif
261
262 #define MAX_POLL_QUANTA 2
263 TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
264
265 #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
266 int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
267
268 uint64_t max_poll_computation;
269
270 uint64_t max_unsafe_rt_computation;
271 uint64_t max_unsafe_fixed_computation;
272 uint64_t sched_safe_rt_duration;
273 uint64_t sched_safe_fixed_duration;
274
275 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
276
277 uint32_t std_quantum;
278 uint32_t min_std_quantum;
279 uint32_t bg_quantum;
280
281 uint32_t std_quantum_us;
282 uint32_t bg_quantum_us;
283
284 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
285
286 uint32_t thread_depress_time;
287 uint32_t default_timeshare_computation;
288 uint32_t default_timeshare_constraint;
289
290 uint32_t max_rt_quantum;
291 uint32_t min_rt_quantum;
292
293 uint32_t rt_deadline_epsilon;
294
295 uint32_t rt_constraint_threshold;
296
297 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
298
299 unsigned sched_tick;
300 uint32_t sched_tick_interval;
301
302 /* Timeshare load calculation interval (15ms) */
303 uint32_t sched_load_compute_interval_us = 15000;
304 uint64_t sched_load_compute_interval_abs;
305 static _Atomic uint64_t sched_load_compute_deadline;
306
307 uint32_t sched_pri_shifts[TH_BUCKET_MAX];
308 uint32_t sched_fixed_shift;
309
310 uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
311
312 /* Allow foreground to decay past default to resolve inversions */
313 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
314 int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
315
316 /* Defaults for timer deadline profiling */
317 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
318 * 2ms */
319 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
320 * <= 5ms */
321
322 uint64_t timer_deadline_tracking_bin_1;
323 uint64_t timer_deadline_tracking_bin_2;
324
325 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
326
327 thread_t sched_maintenance_thread;
328
329 /* interrupts disabled lock to guard recommended cores state */
330 decl_simple_lock_data(, sched_available_cores_lock);
331 uint64_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
332 uint64_t perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
333 uint64_t perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
334 static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
335 static uint64_t sched_online_processors = 0;
336 static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
337 static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
338
339 #if __arm64__
340 static void sched_recommended_cores_maintenance(void);
341 uint64_t perfcontrol_failsafe_starvation_threshold;
342 extern char *proc_name_address(struct proc *p);
343 #endif /* __arm64__ */
344
345 uint64_t sched_one_second_interval;
346 boolean_t allow_direct_handoff = TRUE;
347
348 /* Forwards */
349
350 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
351
352 static void load_shift_init(void);
353 static void preempt_pri_init(void);
354
355 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
356
357 thread_t processor_idle(
358 thread_t thread,
359 processor_t processor);
360
361 static ast_t
362 csw_check_locked(
363 thread_t thread,
364 processor_t processor,
365 processor_set_t pset,
366 ast_t check_reason);
367
368 static void processor_setrun(
369 processor_t processor,
370 thread_t thread,
371 integer_t options);
372
373 static void
374 sched_realtime_timebase_init(void);
375
376 static void
377 sched_timer_deadline_tracking_init(void);
378
379 #if DEBUG
380 extern int debug_task;
381 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
382 #else
383 #define TLOG(a, fmt, args...) do {} while (0)
384 #endif
385
386 static processor_t
387 thread_bind_internal(
388 thread_t thread,
389 processor_t processor);
390
391 static void
392 sched_vm_group_maintenance(void);
393
394 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
395 int8_t sched_load_shifts[NRQS];
396 bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
397 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
398
399 /*
400 * Statically allocate a buffer to hold the longest possible
401 * scheduler description string, as currently implemented.
402 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
403 * to export to userspace via sysctl(3). If either version
404 * changes, update the other.
405 *
406 * Note that in addition to being an upper bound on the strings
407 * in the kernel, it's also an exact parameter to PE_get_default(),
408 * which interrogates the device tree on some platforms. That
409 * API requires the caller know the exact size of the device tree
410 * property, so we need both a legacy size (32) and the current size
411 * (48) to deal with old and new device trees. The device tree property
412 * is similarly padded to a fixed size so that the same kernel image
413 * can run on multiple devices with different schedulers configured
414 * in the device tree.
415 */
416 char sched_string[SCHED_STRING_MAX_LENGTH];
417
418 uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
419
420 /* Global flag which indicates whether Background Stepper Context is enabled */
421 static int cpu_throttle_enabled = 1;
422
423 #if DEVELOPMENT || DEBUG
424 int enable_task_set_cluster_type = 0;
425 bool system_ecore_only = false;
426 #endif /* DEVELOPMENT || DEBUG */
427
/*
 * Boot-time scheduler initialization.
 *
 * Resolves tunables (boot-args first, then device tree, then compile-time
 * defaults), then initializes the selected scheduler policy, the boot
 * pset/processor, and related subsystems. Order matters: SCHED(init) runs
 * before pset/processor init, which consume state it sets up.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the active scheduler's name for sysctl consumption. */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

	cpu_quiescent_counter_init();

	/* Policy-specific init, then the boot pset's RT queue and timers. */
	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	/* enable_skstsct == 2 additionally restricts the system to E-cores. */
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}
472
473 void
sched_timebase_init(void)474 sched_timebase_init(void)
475 {
476 uint64_t abstime;
477
478 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
479 sched_one_second_interval = abstime;
480
481 SCHED(timebase_init)();
482 sched_realtime_timebase_init();
483 }
484
485 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
486
487 void
sched_timeshare_init(void)488 sched_timeshare_init(void)
489 {
490 /*
491 * Calculate the timeslicing quantum
492 * in us.
493 */
494 if (default_preemption_rate < 1) {
495 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
496 }
497 std_quantum_us = (1000 * 1000) / default_preemption_rate;
498
499 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
500
501 if (default_bg_preemption_rate < 1) {
502 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
503 }
504 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
505
506 printf("standard background quantum is %d us\n", bg_quantum_us);
507
508 load_shift_init();
509 preempt_pri_init();
510 sched_tick = 0;
511 }
512
513 void
sched_set_max_unsafe_rt_quanta(int max)514 sched_set_max_unsafe_rt_quanta(int max)
515 {
516 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
517
518 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
519 sched_safe_rt_duration = 2 * ((uint64_t)max) * quantum_size;
520
521 #if DEVELOPMENT || DEBUG
522 max_unsafe_rt_quanta = max;
523 #else
524 /*
525 * On RELEASE kernels, this is only called on boot where
526 * max is already equal to max_unsafe_rt_quanta.
527 */
528 assert3s(max, ==, max_unsafe_rt_quanta);
529 #endif
530 }
531
532 void
sched_set_max_unsafe_fixed_quanta(int max)533 sched_set_max_unsafe_fixed_quanta(int max)
534 {
535 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
536
537 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
538 sched_safe_fixed_duration = 2 * ((uint64_t)max) * quantum_size;
539
540 #if DEVELOPMENT || DEBUG
541 max_unsafe_fixed_quanta = max;
542 #else
543 /*
544 * On RELEASE kernels, this is only called on boot where
545 * max is already equal to max_unsafe_fixed_quanta.
546 */
547 assert3s(max, ==, max_unsafe_fixed_quanta);
548 #endif
549 }
550
/*
 * Convert the timeshare scheduler's microsecond-denominated parameters
 * into mach absolute-time units. Each conversion is asserted to fit in
 * 32 bits and be nonzero. Note that 'abstime' is reused serially; the
 * priority-decay shift computation below deliberately operates on the
 * last value written (the scheduler tick interval).
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 * (abstime still holds the sched tick interval here.)
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* Start each bucket's pri_shift at "no penalty" until load is computed. */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* Recompute the failsafe windows now that quantum sizes are known. */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */
}
610
611 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
612
613 void
pset_rt_init(processor_set_t pset)614 pset_rt_init(processor_set_t pset)
615 {
616 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
617 int i = pri - BASEPRI_RTQUEUES;
618 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
619 queue_init(&rqi->pri_queue);
620 rqi->pri_count = 0;
621 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
622 rqi->pri_constraint = RT_CONSTRAINT_NONE;
623 }
624 os_atomic_init(&pset->rt_runq.count, 0);
625 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
626 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
627 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
628 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
629 }
630
631 /* epsilon for comparing RT deadlines */
632 int rt_deadline_epsilon_us = 100;
633
634 int
sched_get_rt_deadline_epsilon(void)635 sched_get_rt_deadline_epsilon(void)
636 {
637 return rt_deadline_epsilon_us;
638 }
639
640 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)641 sched_set_rt_deadline_epsilon(int new_epsilon_us)
642 {
643 rt_deadline_epsilon_us = new_epsilon_us;
644
645 uint64_t abstime;
646 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
647 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
648 rt_deadline_epsilon = (uint32_t)abstime;
649 }
650
/*
 * Convert the realtime scheduler's fixed parameters (minimum/maximum
 * computation, backup-IPI constraint threshold, deadline epsilon) into
 * mach absolute-time units. Each result is asserted to fit in 32 bits
 * and be nonzero.
 */
static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}
675
676 void
sched_check_spill(processor_set_t pset,thread_t thread)677 sched_check_spill(processor_set_t pset, thread_t thread)
678 {
679 (void)pset;
680 (void)thread;
681
682 return;
683 }
684
685 bool
sched_thread_should_yield(processor_t processor,thread_t thread)686 sched_thread_should_yield(processor_t processor, thread_t thread)
687 {
688 (void)thread;
689
690 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
691 }
692
693 /* Default implementations of .steal_thread_enabled */
694 bool
sched_steal_thread_DISABLED(processor_set_t pset)695 sched_steal_thread_DISABLED(processor_set_t pset)
696 {
697 (void)pset;
698 return false;
699 }
700
701 bool
sched_steal_thread_enabled(processor_set_t pset)702 sched_steal_thread_enabled(processor_set_t pset)
703 {
704 return bit_count(pset->node->pset_map) > 1;
705 }
706
707 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
708
709 /*
710 * Set up values for timeshare
711 * loading factors.
712 */
static void
load_shift_init(void)
{
	/* p walks the table being filled; k is the shift written per entry. */
	int8_t k, *p = sched_load_shifts;
	uint32_t i, j;

	uint32_t sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1 are special-cased: no penalty / unit penalty. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/*
	 * Outer loop advances k; inner loop fills entries [i, min(j, NRQS))
	 * with that k, where the band boundary j doubles each iteration.
	 * Both loops share i, so the table is filled exactly once.
	 */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
758
759 static void
preempt_pri_init(void)760 preempt_pri_init(void)
761 {
762 bitmap_t *p = sched_preempt_pri;
763
764 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
765 bitmap_set(p, i);
766 }
767
768 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
769 bitmap_set(p, i);
770 }
771 }
772
773 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
774
775 void
check_monotonic_time(uint64_t ctime)776 check_monotonic_time(uint64_t ctime)
777 {
778 processor_t processor = current_processor();
779 uint64_t last_dispatch = processor->last_dispatch;
780
781 if (last_dispatch > ctime) {
782 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
783 last_dispatch, ctime);
784 }
785 }
786
787
788 /*
789 * Thread wait timer expiration.
790 */
/*
 * Thread wait timer expiration.
 *
 * Timer-call handler for a thread's wait timeout. 'p0' is the thread;
 * 'p1' is unused. wait_timer_active is a reference count of pending
 * expirations/cancellations — only the holder of the last reference may
 * act. If the timer is still armed at that point, the thread's wait is
 * cleared with THREAD_TIMED_OUT.
 */
void
thread_timer_expire(
	void *p0,
	__unused void *p1)
{
	thread_t thread = p0;
	spl_t s;

	assert_thread_magic(thread);

	/* Thread state is protected by the thread lock at splsched. */
	s = splsched();
	thread_lock(thread);
	if (--thread->wait_timer_active == 0) {
		if (thread->wait_timer_is_set) {
			thread->wait_timer_is_set = FALSE;
			clear_wait_internal(thread, THREAD_TIMED_OUT);
		}
	}
	thread_unlock(thread);
	splx(s);
}
812
813 /*
814 * thread_unblock:
815 *
816 * Unblock thread on wake up.
817 *
818 * Returns TRUE if the thread should now be placed on the runqueue.
819 *
820 * Thread must be locked.
821 *
822 * Called at splsched().
823 */
boolean_t
thread_unblock(
	thread_t thread,
	wait_result_t wresult)
{
	boolean_t ready_for_runq = FALSE;
	thread_t cthread = current_thread();
	uint32_t new_run_count;
	int old_thread_state;

	/*
	 * Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 * Cancel pending wait timer.
	 */
	if (thread->wait_timer_is_set) {
		if (timer_call_cancel(thread->wait_timer)) {
			/* Cancelled before it fired: drop the expiry's reference. */
			thread->wait_timer_active--;
		}
		thread->wait_timer_is_set = FALSE;
	}

	/* Were we woken from interrupt context / platform idle? (for accounting below) */
	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 * Update scheduling state: not waiting,
	 * set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT);

	if ((old_thread_state & TH_RUN) == 0) {
		/* Thread was genuinely blocked; it becomes runnable now. */
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		/* Deliver the sched_call unblock notification if one was requested. */
		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		/* Direct interrupt-context wakeup of a non-callout thread. */
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		/* Bin the wakeup by how far ahead the timer deadline was set. */
		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			/* "Double hop": interrupt woke a callout thread, which woke us. */
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	/* Record the wakeup context on callout threads for future propagation. */
	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
977
978 /*
979 * Routine: thread_allowed_for_handoff
980 * Purpose:
981 * Check if the thread is allowed for handoff operation
982 * Conditions:
983 * thread lock held, IPC locks may be held.
984 * TODO: In future, do not allow handoff if threads have different cluster
985 * recommendations.
986 */
987 boolean_t
thread_allowed_for_handoff(thread_t thread)988 thread_allowed_for_handoff(
989 thread_t thread)
990 {
991 thread_t self = current_thread();
992
993 if (allow_direct_handoff &&
994 thread->sched_mode == TH_MODE_REALTIME &&
995 self->sched_mode == TH_MODE_REALTIME) {
996 return TRUE;
997 }
998
999 return FALSE;
1000 }
1001
1002 /*
1003 * Routine: thread_go
1004 * Purpose:
1005 * Unblock and dispatch thread.
1006 * Conditions:
1007 * thread lock held, IPC locks may be held.
1008 * thread must have been pulled from wait queue under same lock hold.
1009 * thread must have been waiting
1010 * Returns:
1011 * KERN_SUCCESS - Thread was set running
1012 *
1013 * TODO: This should return void
1014 */
kern_return_t
thread_go(
	thread_t thread,
	wait_result_t wresult,
	bool try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/* Caller must have already pulled the thread off its wait queue
	 * under the same lock hold (see routine header). */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_wait_possible(thread));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);


	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			/* Stash a reference to the woken thread on self; the
			 * context-switch path will hand the processor to it
			 * directly rather than going through the run queues. */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			/* Normal path: enqueue the thread for dispatch. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}

	return KERN_SUCCESS;
}
1050
1051 /*
1052 * Routine: thread_mark_wait_locked
1053 * Purpose:
1054 * Mark a thread as waiting. If, given the circumstances,
1055 * it doesn't want to wait (i.e. already aborted), then
1056 * indicate that in the return value.
1057 * Conditions:
1058 * at splsched() and thread is locked.
1059 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t thread,
	wait_interrupt_t interruptible_orig)
{
	boolean_t at_safe_point;
	wait_interrupt_t interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 * The thread may have certain types of interrupts/aborts masked
	 * off.  Even if the wait location says these types of interrupts
	 * are OK, we have to honor mask settings (outer-scoped code may
	 * not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/* Enter the wait unless an abort is already pending that this
	 * wait is permitted to observe. */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			/* Fully uninterruptible wait. */
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/* Report this wait through the sched_call unless the
			 * caller suppressed reporting via the NOREPORT flags
			 * (kernel tasks use a wider suppression mask). */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Pending abort that this wait must honor: consume the
		 * abort-safely flag and refuse to wait. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1128
1129 /*
1130 * Routine: thread_interrupt_level
1131 * Purpose:
1132 * Set the maximum interruptible state for the
1133 * current thread. The effective value of any
1134 * interruptible flag passed into assert_wait
1135 * will never exceed this.
1136 *
1137 * Useful for code that must not be interrupted,
1138 * but which calls code that doesn't know that.
1139 * Returns:
1140 * The old interrupt level for the thread.
1141 */
1142 __private_extern__
1143 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1144 thread_interrupt_level(
1145 wait_interrupt_t new_level)
1146 {
1147 thread_t thread = current_thread();
1148 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1149
1150 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1151
1152 return result;
1153 }
1154
1155 /*
1156 * assert_wait:
1157 *
1158 * Assert that the current thread is about to go to
1159 * sleep until the specified event occurs.
1160 */
1161 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1162 assert_wait(
1163 event_t event,
1164 wait_interrupt_t interruptible)
1165 {
1166 if (__improbable(event == NO_EVENT)) {
1167 panic("%s() called with NO_EVENT", __func__);
1168 }
1169
1170 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1171 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1172 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1173
1174 struct waitq *waitq;
1175 waitq = global_eventq(event);
1176 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1177 }
1178
1179 /*
1180 * assert_wait_queue:
1181 *
1182 * Return the global waitq for the specified event
1183 */
struct waitq *
assert_wait_queue(
	event_t event)
{
	/* Hash the event address to its global wait queue. */
	return global_eventq(event);
}
1190
1191 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1192 assert_wait_timeout(
1193 event_t event,
1194 wait_interrupt_t interruptible,
1195 uint32_t interval,
1196 uint32_t scale_factor)
1197 {
1198 thread_t thread = current_thread();
1199 wait_result_t wresult;
1200 uint64_t deadline;
1201 spl_t s;
1202
1203 if (__improbable(event == NO_EVENT)) {
1204 panic("%s() called with NO_EVENT", __func__);
1205 }
1206
1207 struct waitq *waitq;
1208 waitq = global_eventq(event);
1209
1210 s = splsched();
1211 waitq_lock(waitq);
1212
1213 clock_interval_to_deadline(interval, scale_factor, &deadline);
1214
1215 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1216 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1217 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1218
1219 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1220 interruptible,
1221 TIMEOUT_URGENCY_SYS_NORMAL,
1222 deadline, TIMEOUT_NO_LEEWAY,
1223 thread);
1224
1225 waitq_unlock(waitq);
1226 splx(s);
1227 return wresult;
1228 }
1229
1230 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1231 assert_wait_timeout_with_leeway(
1232 event_t event,
1233 wait_interrupt_t interruptible,
1234 wait_timeout_urgency_t urgency,
1235 uint32_t interval,
1236 uint32_t leeway,
1237 uint32_t scale_factor)
1238 {
1239 thread_t thread = current_thread();
1240 wait_result_t wresult;
1241 uint64_t deadline;
1242 uint64_t abstime;
1243 uint64_t slop;
1244 uint64_t now;
1245 spl_t s;
1246
1247 if (__improbable(event == NO_EVENT)) {
1248 panic("%s() called with NO_EVENT", __func__);
1249 }
1250
1251 now = mach_absolute_time();
1252 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1253 deadline = now + abstime;
1254
1255 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1256
1257 struct waitq *waitq;
1258 waitq = global_eventq(event);
1259
1260 s = splsched();
1261 waitq_lock(waitq);
1262
1263 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1264 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1265 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1266
1267 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1268 interruptible,
1269 urgency, deadline, slop,
1270 thread);
1271
1272 waitq_unlock(waitq);
1273 splx(s);
1274 return wresult;
1275 }
1276
1277 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1278 assert_wait_deadline(
1279 event_t event,
1280 wait_interrupt_t interruptible,
1281 uint64_t deadline)
1282 {
1283 thread_t thread = current_thread();
1284 wait_result_t wresult;
1285 spl_t s;
1286
1287 if (__improbable(event == NO_EVENT)) {
1288 panic("%s() called with NO_EVENT", __func__);
1289 }
1290
1291 struct waitq *waitq;
1292 waitq = global_eventq(event);
1293
1294 s = splsched();
1295 waitq_lock(waitq);
1296
1297 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1298 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1299 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1300
1301 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1302 interruptible,
1303 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1304 TIMEOUT_NO_LEEWAY, thread);
1305 waitq_unlock(waitq);
1306 splx(s);
1307 return wresult;
1308 }
1309
1310 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1311 assert_wait_deadline_with_leeway(
1312 event_t event,
1313 wait_interrupt_t interruptible,
1314 wait_timeout_urgency_t urgency,
1315 uint64_t deadline,
1316 uint64_t leeway)
1317 {
1318 thread_t thread = current_thread();
1319 wait_result_t wresult;
1320 spl_t s;
1321
1322 if (__improbable(event == NO_EVENT)) {
1323 panic("%s() called with NO_EVENT", __func__);
1324 }
1325
1326 struct waitq *waitq;
1327 waitq = global_eventq(event);
1328
1329 s = splsched();
1330 waitq_lock(waitq);
1331
1332 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1333 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1334 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1335
1336 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1337 interruptible,
1338 urgency, deadline, leeway,
1339 thread);
1340 waitq_unlock(waitq);
1341 splx(s);
1342 return wresult;
1343 }
1344
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	/* Start in the initial state: no pending wakeup, waiter not active. */
	os_atomic_init(cond, SCHED_COND_INIT);
}
1351
/*
 * Wait on the condition unless a wakeup was posted while we were
 * still marked active; in that case consume the wakeup and return
 * immediately without blocking.
 */
wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	return thread_block(continuation);
}
1370
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	/* One atomic XOR both consumes the pending wakeup bit and marks the
	 * waiter active again.  The acquire ordering pairs with the release
	 * in sched_cond_signal(). */
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1379
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t *cond,
	thread_t thread)
{
	disable_preemption();
	/* Post the wakeup bit; the release ordering pairs with the acquire
	 * in sched_cond_ack(). */
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1394
1395 /*
1396 * thread_isoncpu:
1397 *
1398 * Return TRUE if a thread is running on a processor such that an AST
1399 * is needed to pull it out of userspace execution, or if executing in
1400 * the kernel, bring to a context switch boundary that would cause
1401 * thread state to be serialized in the thread PCB.
1402 *
1403 * Thread locked, returns the same way. While locked, fields
1404 * like "state" cannot change. "runq" can change only from set to unset.
1405 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Caller holds the thread lock (see header comment above). */

	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread->runq != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}
1438
1439 /*
1440 * thread_stop:
1441 *
1442 * Force a preemption point for a thread and wait
1443 * for it to stop running on a CPU. If a stronger
1444 * guarantee is requested, wait until no longer
1445 * runnable. Arbitrates access among
1446 * multiple stop requests. (released by unstop)
1447 *
1448 * The thread must enter a wait state and stop via a
1449 * separate means.
1450 *
1451 * Returns FALSE if interrupted.
1452 */
boolean_t
thread_stop(
	thread_t thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	spl_t s = splsched();
	boolean_t oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/* Arbitrate with other stoppers: only one TH_SUSP holder at a
	 * time.  Sleep on wake_active until the current holder releases
	 * it via thread_unstop(). */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted before we could claim the stop. */
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread->state |= TH_SUSP;

	/* Now wait for the thread to come off CPU (and, if requested,
	 * to cease being runnable altogether). */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			/* Kick the processor so the thread reaches a
			 * context-switch boundary. */
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted: release the stop we claimed above. */
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1532
1533 /*
1534 * thread_unstop:
1535 *
1536 * Release a previous stop request and set
1537 * the thread running if appropriate.
1538 *
1539 * Use only after a successful stop operation.
1540 */
void
thread_unstop(
	thread_t thread)
{
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A stopped thread must still be runnable or waiting; bare TH_SUSP
	 * alone is an invalid state. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			/* Someone is sleeping in thread_stop()/thread_wait();
			 * wake them after dropping the thread lock. */
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1571
1572 /*
1573 * thread_wait:
1574 *
1575 * Wait for a thread to stop running. (non-interruptible)
1576 *
1577 */
void
thread_wait(
	thread_t thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	boolean_t oncpu;
	processor_t processor;
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU. If stronger requirement
	 * desired, wait until not runnable. Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			/* Force the thread to a context-switch boundary. */
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		/* Request a wakeup when the thread's run state changes. */
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1626
1627 /*
1628 * Routine: clear_wait_internal
1629 *
1630 * Clear the wait condition for the specified thread.
1631 * Start the thread executing if that is appropriate.
1632 * Arguments:
1633 * thread thread to awaken
1634 * result Wakeup result the thread should see
1635 * Conditions:
1636 * At splsched
1637 * the thread is locked.
1638 * Returns:
1639 * KERN_SUCCESS thread was rousted out a wait
1640 * KERN_FAILURE thread was waiting but could not be rousted
1641 * KERN_NOT_WAITING thread was not waiting
1642 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t thread,
	wait_result_t wresult)
{
	waitq_t waitq = thread->waitq;

	/* An uninterruptible wait cannot be broken by an interrupt. */
	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/* If still enqueued on a wait queue, pull the thread off; failure
	 * means another path already woke it. */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	/* TODO: Can we instead assert TH_TERMINATE is not set? */
	if ((thread->state & (TH_WAIT | TH_TERMINATE)) != TH_WAIT) {
		return KERN_NOT_WAITING;
	}

	return thread_go(thread, wresult, /* handoff */ false);
}
1665
1666
1667 /*
1668 * clear_wait:
1669 *
1670 * Clear the wait condition for the specified thread. Start the thread
1671 * executing if that is appropriate.
1672 *
1673 * parameters:
1674 * thread thread to awaken
1675 * result Wakeup result the thread should see
1676 */
1677 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1678 clear_wait(
1679 thread_t thread,
1680 wait_result_t result)
1681 {
1682 kern_return_t ret;
1683 spl_t s;
1684
1685 s = splsched();
1686 thread_lock(thread);
1687 ret = clear_wait_internal(thread, result);
1688 thread_unlock(thread);
1689 splx(s);
1690 return ret;
1691 }
1692
1693
1694 /*
1695 * thread_wakeup_prim:
1696 *
1697 * Common routine for thread_wakeup, thread_wakeup_with_result,
1698 * and thread_wakeup_one.
1699 *
1700 */
1701 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1702 thread_wakeup_prim(
1703 event_t event,
1704 boolean_t one_thread,
1705 wait_result_t result)
1706 {
1707 if (__improbable(event == NO_EVENT)) {
1708 panic("%s() called with NO_EVENT", __func__);
1709 }
1710
1711 struct waitq *wq = global_eventq(event);
1712
1713 if (one_thread) {
1714 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1715 } else {
1716 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1717 }
1718 }
1719
1720 /*
1721 * Wakeup a specified thread if and only if it's waiting for this event
1722 */
1723 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1724 thread_wakeup_thread(
1725 event_t event,
1726 thread_t thread)
1727 {
1728 if (__improbable(event == NO_EVENT)) {
1729 panic("%s() called with NO_EVENT", __func__);
1730 }
1731
1732 if (__improbable(thread == THREAD_NULL)) {
1733 panic("%s() called with THREAD_NULL", __func__);
1734 }
1735
1736 struct waitq *wq = global_eventq(event);
1737
1738 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1739 }
1740
1741 /*
1742 * Wakeup a thread waiting on an event and promote it to a priority.
1743 *
1744 * Requires woken thread to un-promote itself when done.
1745 */
1746 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1747 thread_wakeup_one_with_pri(
1748 event_t event,
1749 int priority)
1750 {
1751 if (__improbable(event == NO_EVENT)) {
1752 panic("%s() called with NO_EVENT", __func__);
1753 }
1754
1755 struct waitq *wq = global_eventq(event);
1756
1757 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1758 }
1759
1760 /*
1761 * Wakeup a thread waiting on an event,
1762 * promote it to a priority,
1763 * and return a reference to the woken thread.
1764 *
1765 * Requires woken thread to un-promote itself when done.
1766 */
1767 thread_t
thread_wakeup_identify(event_t event,int priority)1768 thread_wakeup_identify(event_t event,
1769 int priority)
1770 {
1771 if (__improbable(event == NO_EVENT)) {
1772 panic("%s() called with NO_EVENT", __func__);
1773 }
1774
1775 struct waitq *wq = global_eventq(event);
1776
1777 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1778 }
1779
1780 /*
1781 * thread_bind:
1782 *
1783 * Force the current thread to execute on the specified processor.
1784 * Takes effect after the next thread_block().
1785 *
1786 * Returns the previous binding. PROCESSOR_NULL means
1787 * not bound.
1788 *
1789 * XXX - DO NOT export this to users - XXX
1790 */
1791 processor_t
thread_bind(processor_t processor)1792 thread_bind(
1793 processor_t processor)
1794 {
1795 thread_t self = current_thread();
1796 processor_t prev;
1797 spl_t s;
1798
1799 s = splsched();
1800 thread_lock(self);
1801
1802 prev = thread_bind_internal(self, processor);
1803
1804 thread_unlock(self);
1805 splx(s);
1806
1807 return prev;
1808 }
1809
1810 /*
1811 * thread_bind_internal:
1812 *
1813 * If the specified thread is not the current thread, and it is currently
1814 * running on another CPU, a remote AST must be sent to that CPU to cause
1815 * the thread to migrate to its bound processor. Otherwise, the migration
1816 * will occur at the next quantum expiration or blocking point.
1817 *
 * When the thread is the current thread, an explicit thread_block() should
1819 * be used to force the current processor to context switch away and
1820 * let the thread migrate to the bound processor.
1821 *
1822 * Thread must be locked, and at splsched.
1823 */
1824
static processor_t
thread_bind_internal(
	thread_t thread,
	processor_t processor)
{
	processor_t prev;

	/* <rdar://problem/15102234> */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	assert(thread->runq == PROCESSOR_NULL);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);

	/* Record the new binding; it takes effect at the next dispatch. */
	prev = thread->bound_processor;
	thread->bound_processor = processor;

	return prev;
}
1844
1845 /*
1846 * thread_vm_bind_group_add:
1847 *
1848 * The "VM bind group" is a special mechanism to mark a collection
1849 * of threads from the VM subsystem that, in general, should be scheduled
1850 * with only one CPU of parallelism. To accomplish this, we initially
1851 * bind all the threads to the master processor, which has the effect
1852 * that only one of the threads in the group can execute at once, including
1853 * preempting threads in the group that are a lower priority. Future
1854 * mechanisms may use more dynamic mechanisms to prevent the collection
1855 * of VM threads from using more CPU time than desired.
1856 *
1857 * The current implementation can result in priority inversions where
1858 * compute-bound priority 95 or realtime threads that happen to have
1859 * landed on the master processor prevent the VM threads from running.
1860 * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
 * additional CPUs, before restoring the binding (assuming high latency
1863 * is no longer a problem).
1864 */
1865
1866 /*
1867 * The current max is provisioned for:
1868 * vm_compressor_swap_trigger_thread (92)
1869 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1870 * vm_pageout_continue (92)
1871 * memorystatus_thread (95)
1872 */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the thread list and count below. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while the group's binding has been temporarily dropped to work
 * around a detected priority inversion (see sched_vm_group_maintenance). */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1878
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	/* The group list holds a reference on each member thread. */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1897
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: look for signs of a priority inversion — a group member
	 * stuck on a runqueue for longer than one scheduler tick. */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread->runq == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new bind target to every thread in the group. */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
1995
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* Realtime scheduling policy tunables. */
int sched_allow_rt_smt = 1;
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Clamped to [0, SCHED_MAX_BACKUP_PROCESSORS] by sched_set_rt_n_backup_processors(). */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2008
2009 int
sched_get_rt_n_backup_processors(void)2010 sched_get_rt_n_backup_processors(void)
2011 {
2012 return sched_rt_n_backup_processors;
2013 }
2014
2015 void
sched_set_rt_n_backup_processors(int n)2016 sched_set_rt_n_backup_processors(int n)
2017 {
2018 if (n < 0) {
2019 n = 0;
2020 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2021 n = SCHED_MAX_BACKUP_PROCESSORS;
2022 }
2023
2024 sched_rt_n_backup_processors = n;
2025 }
2026
/* Whether the realtime run queue enforces strict priority ordering (off by default). */
int sched_rt_runq_strict_priority = false;
2028
2029 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2030 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2031 {
2032 if (current_pset != new_pset) {
2033 pset_unlock(current_pset);
2034 pset_lock(new_pset);
2035 }
2036
2037 return new_pset;
2038 }
2039
2040 /*
2041 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2042 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2043 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2044 * IPI thrash if this core does not remain idle following the load balancing ASTs
2045 * Idle "thrash", when IPI issue is followed by idle entry/core power down
2046 * followed by a wakeup shortly thereafter.
2047 */
2048
#if (DEVELOPMENT || DEBUG)
/* Debug switch: set to 0 to disable SMT rebalancing IPIs in sched_SMT_balance() */
int sched_smt_balance = 1;
#endif
2052
/* Invoked with pset locked, returns with pset unlocked */
void
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	/* The "other half" of this core: secondary if we are the primary, else the primary. */
	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	/*
	 * NOTE: the gotos above jump past this declaration; that is safe
	 * because ast_processor remains NULL on those paths, so ipi_type
	 * is never read uninitialized at the exit label.
	 */
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Secondaries that are RUNNING while we (a whole core) sit idle. */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/*
		 * Over-committed core: both its primary and this secondary are
		 * running, and the secondary's thread is not realtime — a
		 * candidate to migrate here.
		 */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* Deliver the rebalance IPI only after dropping the pset lock. */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
}
2105
2106 static cpumap_t
pset_available_cpumap(processor_set_t pset)2107 pset_available_cpumap(processor_set_t pset)
2108 {
2109 return pset->cpu_available_map & pset->recommended_bitmask;
2110 }
2111
2112 int
pset_available_cpu_count(processor_set_t pset)2113 pset_available_cpu_count(processor_set_t pset)
2114 {
2115 return bit_count(pset_available_cpumap(pset));
2116 }
2117
2118 bool
pset_is_recommended(processor_set_t pset)2119 pset_is_recommended(processor_set_t pset)
2120 {
2121 if (!pset) {
2122 return false;
2123 }
2124 return pset_available_cpu_count(pset) > 0;
2125 }
2126
2127 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2128 pset_available_but_not_running_cpumap(processor_set_t pset)
2129 {
2130 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2131 pset->recommended_bitmask;
2132 }
2133
2134 bool
pset_has_stealable_threads(processor_set_t pset)2135 pset_has_stealable_threads(processor_set_t pset)
2136 {
2137 pset_assert_locked(pset);
2138
2139 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2140 /*
2141 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2142 * available primary CPUs
2143 */
2144 avail_map &= pset->primary_map;
2145
2146 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2147 }
2148
2149 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2150 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2151 {
2152 cpumap_t avail_map = pset_available_cpumap(pset);
2153 if (!sched_allow_rt_smt) {
2154 /*
2155 * Secondary CPUs are not allowed to run RT threads, so
2156 * only primary CPUs should be included
2157 */
2158 avail_map &= pset->primary_map;
2159 }
2160
2161 return avail_map & ~pset->realtime_map;
2162 }
2163
2164 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2165 pset_needs_a_followup_IPI(processor_set_t pset)
2166 {
2167 int nbackup_cpus = 0;
2168
2169 if (rt_runq_is_low_latency(pset)) {
2170 nbackup_cpus = sched_rt_n_backup_processors;
2171 }
2172
2173 int rt_rq_count = rt_runq_count(pset);
2174
2175 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2176 }
2177
2178 bool
pset_has_stealable_rt_threads(processor_set_t pset)2179 pset_has_stealable_rt_threads(processor_set_t pset)
2180 {
2181 pset_node_t node = pset->node;
2182 if (bit_count(node->pset_map) == 1) {
2183 return false;
2184 }
2185
2186 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2187
2188 return rt_runq_count(pset) > bit_count(avail_map);
2189 }
2190
2191 static void
pset_update_rt_stealable_state(processor_set_t pset)2192 pset_update_rt_stealable_state(processor_set_t pset)
2193 {
2194 if (pset_has_stealable_rt_threads(pset)) {
2195 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2196 } else {
2197 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2198 }
2199 }
2200
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);

	/*
	 * Clear this CPU's urgent-AST bit, emitting an END tracepoint only if
	 * the bit was actually set.  trace_point_number distinguishes the
	 * caller's call site in the trace stream.
	 */
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	/* The preempt bit is cleared unconditionally (no tracepoint). */
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2216
2217 /*
2218 * Called with pset locked, on a processor that is committing to run a new thread
2219 * Will transition an idle or dispatching processor to running as it picks up
2220 * the first new thread from the idle thread.
2221 */
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
	pset_assert_locked(pset);

	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
		assert(current_thread() == processor->idle_thread);

		/*
		 * Dispatching processor is now committed to running new_thread,
		 * so change its state to PROCESSOR_RUNNING.
		 */
		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	} else {
		assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
	}

	processor_state_update_from_thread(processor, new_thread, true);

	/* Track whether this CPU is now occupied by a realtime thread. */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		bit_set(pset->realtime_map, processor->cpu_id);
	} else {
		bit_clear(pset->realtime_map, processor->cpu_id);
	}
	pset_update_rt_stealable_state(pset);

	pset_node_t node = pset->node;

	if (bit_count(node->pset_map) == 1) {
		/* Node has only a single pset, so skip node pset map updates */
		return;
	}

	cpumap_t avail_map = pset_available_cpumap(pset);

	/*
	 * Keep the node-level "has non-RT CPUs" maps in sync.  These are
	 * relaxed atomics: they are advisory hints for cross-pset placement,
	 * not synchronization.
	 */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT primary CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
		}
	} else {
		/* Test before set to avoid dirtying the shared cache line needlessly. */
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
			}
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
			}
		}
	}
}
2281
2282 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2283 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2284 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2285 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2286 #if defined(__x86_64__)
2287 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2288 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2289 #endif
2290 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2291 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2292
2293 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2294 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2295 {
2296 pset_map_t pset_map = stealing_pset->node->pset_map;
2297
2298 bit_clear(pset_map, stealing_pset->pset_id);
2299
2300 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2301 processor_set_t nset = pset_array[pset_id];
2302
2303 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2304 return true;
2305 }
2306 }
2307
2308 return false;
2309 }
2310
2311 /*
2312 * starting_pset must be locked, but returns true if it is unlocked before return
2313 */
static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
    processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
{
	bool starting_pset_is_unlocked = false;
	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
	int max_pri = rt_runq_priority(starting_pset);
	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
	processor_set_t pset = starting_pset;
	processor_t next_rt_processor = PROCESSOR_NULL;
	/*
	 * Spill IPIs target OTHER psets: advance to the next pset before
	 * searching (change_locked_pset drops starting_pset's lock).
	 * Followup IPIs (spill_ipi == false) search only starting_pset.
	 */
	if (spill_ipi) {
		processor_set_t nset = next_pset(pset);
		assert(nset != starting_pset);
		pset = change_locked_pset(pset, nset);
		starting_pset_is_unlocked = true;
	}
	/* Walk psets (wrapping) until a suitable RT processor is found or we return to the start. */
	do {
		const bool consider_secondaries = true;
		next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
		if (next_rt_processor == PROCESSOR_NULL) {
			if (!spill_ipi) {
				break;
			}
			processor_set_t nset = next_pset(pset);
			if (nset == starting_pset) {
				break;
			}
			pset = change_locked_pset(pset, nset);
			starting_pset_is_unlocked = true;
		}
	} while (next_rt_processor == PROCESSOR_NULL);
	if (next_rt_processor) {
		/* Cross-pset target: mark it spill-pending so it knows to look remotely. */
		if (pset != starting_pset) {
			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
			}
		}
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
	/* Never return holding a foreign pset's lock. */
	if (pset != starting_pset) {
		pset_unlock(pset);
	}

	return starting_pset_is_unlocked;
}
2361
2362 /*
2363 * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2364 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2365 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2366 */
typedef enum {
	none,
	backup,         /* backup CPU signalled in case the preferred CPU is delayed */
	followup,       /* additional CPU signalled while work remains on the runqueue */
	spill           /* CPU in another pset signalled to steal from this runqueue */
} next_processor_type_t;
2373
/* Debug instrumentation for the thread_select() retry loop; disabled by the #undef. */
#undef LOOP_COUNT
#ifdef LOOP_COUNT
int max_loop_count[MAX_SCHED_CPUS] = { 0 };
#endif
2378
2379 /*
2380 * thread_select:
2381 *
2382 * Select a new thread for the current processor to execute.
2383 *
2384 * May select the current thread, which must be locked.
2385 */
static thread_t
thread_select(thread_t thread,
    processor_t processor,
    ast_t *reason)
{
	processor_set_t pset = processor->processor_set;
	thread_t new_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);

	/* idle_reason encodes why we went idle; emitted as (10 + idle_reason) in the END tracepoint. */
	__kdebug_only int idle_reason = 0;
	__kdebug_only int delay_count = 0;

#if defined(__x86_64__)
	/*
	 * Budget of 10us waits this CPU may spend deferring to a preferred RT
	 * processor before acting as backup itself (see the delay loop below).
	 */
	int timeout_count = sched_backup_cpu_timeout_count;
	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
		/* Prefer cpu0 as backup */
		timeout_count--;
	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
		/* Prefer secondary cpu as backup */
		timeout_count--;
	}
#endif
	/* Snapshots of this CPU's AST bits, used to detect changes across lock drops. */
	bool pending_AST_URGENT = false;
	bool pending_AST_PREEMPT = false;

#ifdef LOOP_COUNT
	int loop_count = -1;
#endif

	do {
		/*
		 * Update the priority.
		 */
		if (SCHED(can_update_priority)(thread)) {
			SCHED(update_priority)(thread);
		}

		pset_lock(pset);

restart:
#ifdef LOOP_COUNT
		loop_count++;
		if (loop_count > max_loop_count[processor->cpu_id]) {
			max_loop_count[processor->cpu_id] = loop_count;
			if (bit_count(loop_count) == 1) {
				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
			}
		}
#endif
		/* Re-sample AST bits under the pset lock on every (re)start. */
		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

		processor_state_update_from_thread(processor, thread, true);

		idle_reason = 0;

		processor_t ast_processor = PROCESSOR_NULL;
		processor_t next_rt_processor = PROCESSOR_NULL;
		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;

		assert(processor->state != PROCESSOR_OFF_LINE);

		/*
		 * Bound threads are dispatched to a processor without going through
		 * choose_processor(), so in those cases we must continue trying to dequeue work
		 * as we are the only option.
		 */
		if (!SCHED(processor_bound_count)(processor)) {
			if (!processor->is_recommended) {
				/*
				 * The performance controller has provided a hint to not dispatch more threads,
				 */
				idle_reason = 1;
				goto send_followup_ipi_before_idle;
			} else if (rt_runq_count(pset)) {
				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
				/* Give the current RT thread a chance to complete */
				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
#if defined(__x86_64__)
				/*
				 * On Intel we want to avoid SMT secondary processors and processor 0
				 * but allow them to be used as backup processors in case the preferred chosen
				 * processor is delayed by interrupts or processor stalls. So if it is
				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
				 * to grab the thread before the (current) backup processor does.
				 *
				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
				 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
				 * cpu0 before secondary cpus or not.
				 */
				if (!ok_to_run_realtime_thread) {
					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
						if (timeout_count-- > 0) {
							/* Drop both locks around the delay so the preferred CPU can make progress. */
							pset_unlock(pset);
							thread_unlock(thread);
							delay(10);
							delay_count++;
							thread_lock(thread);
							pset_lock(pset);
							goto restart;
						}
						ok_to_run_realtime_thread = true;
					}
				}
#endif
				if (!ok_to_run_realtime_thread) {
					idle_reason = 2;
					goto send_followup_ipi_before_idle;
				}
			} else if (processor->processor_primary != processor) {
				/*
				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
				 * we should look for work only under the same conditions that choose_processor()
				 * would have assigned work, which is when all primary processors have been assigned work.
				 */
				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
					/* There are idle primaries */
					idle_reason = 3;
					goto idle;
				}
			}
		}

		/*
		 * Test to see if the current thread should continue
		 * to run on this processor. Must not be attempting to wait, and not
		 * bound to a different processor, nor be in the wrong
		 * processor set, nor be forced to context switch by TH_SUSP.
		 *
		 * Note that there are never any RT threads in the regular runqueue.
		 *
		 * This code is very insanely tricky.
		 */

		/* i.e. not waiting, not TH_SUSP'ed */
		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);

		/*
		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
		 * <rdar://problem/47907700>
		 *
		 * A yielding thread shouldn't be forced to context switch.
		 */

		bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;

		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;

		bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;

		bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;

		bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread);

		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);

		bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
		if (current_thread_can_keep_running) {
			/*
			 * This thread is eligible to keep running on this processor.
			 *
			 * RT threads with un-expired quantum stay on processor,
			 * unless there's a valid RT thread with an earlier deadline
			 * and it is still ok_to_run_realtime_thread.
			 */
			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
				/*
				 * Pick a new RT thread only if ok_to_run_realtime_thread
				 * (but the current thread is allowed to complete).
				 */
				if (ok_to_run_realtime_thread) {
					/* A spill request from another pset always forces a re-pick. */
					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
						goto pick_new_rt_thread;
					}
					if (rt_runq_priority(pset) > thread->sched_pri) {
						if (sched_rt_runq_strict_priority) {
							/* The next RT thread is better, so pick it off the runqueue. */
							goto pick_new_rt_thread;
						}

						/*
						 * See if the current lower priority thread can continue to run without causing
						 * the higher priority thread on the runq queue to miss its deadline.
						 */
						thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
							/* The next RT thread is better, so pick it off the runqueue. */
							goto pick_new_rt_thread;
						}
					} else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
						/* The next RT thread is better, so pick it off the runqueue. */
						goto pick_new_rt_thread;
					}
					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
						goto pick_new_rt_thread;
					}
				}

				/* This is still the best RT thread to run. */
				processor->deadline = thread->realtime.deadline;

				sched_update_pset_load_average(pset, 0);

				clear_pending_AST_bits(pset, processor, 1);

				next_rt_processor = PROCESSOR_NULL;
				next_rt_ipi_type = SCHED_IPI_NONE;

				/* Before returning, signal another CPU to pick up remaining queued RT work. */
				bool pset_unlocked = false;
				__kdebug_only next_processor_type_t nptype = none;
				if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
					nptype = spill;
					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
				} else if (pset_needs_a_followup_IPI(pset)) {
					nptype = followup;
					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
				}
				if (!pset_unlocked) {
					pset_unlock(pset);
				}

				if (next_rt_processor) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
				}

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
				return thread;
			}

			if ((rt_runq_count(pset) == 0) &&
			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
				/* This thread is still the highest priority runnable (non-idle) thread */
				processor->deadline = RT_DEADLINE_NONE;

				sched_update_pset_load_average(pset, 0);

				clear_pending_AST_bits(pset, processor, 2);

				pset_unlock(pset);

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
				return thread;
			}
		} else {
			/*
			 * This processor must context switch.
			 * If it's due to a rebalance, we should aggressively find this thread a new home.
			 */
			if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
				*reason |= AST_REBALANCE;
			}
		}

		/* True when our secondary was forced idle on behalf of the outgoing thread. */
		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
		    (processor->processor_secondary->state == PROCESSOR_IDLE));

		/* OK, so we're not going to run the current thread. Look at the RT queue. */
		if (ok_to_run_realtime_thread) {
pick_new_rt_thread:
			new_thread = sched_rt_choose_thread(pset);
			if (new_thread != THREAD_NULL) {
				processor->deadline = new_thread->realtime.deadline;
				pset_commit_processor_to_new_thread(pset, processor, new_thread);

				clear_pending_AST_bits(pset, processor, 3);

				/* Kick a running secondary off this core: RT thread wants the whole core. */
				if (processor->processor_secondary != NULL) {
					processor_t sprocessor = processor->processor_secondary;
					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
						ast_processor = sprocessor;
					}
				}
			}
		}

send_followup_ipi_before_idle:
		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
		}
		__kdebug_only next_processor_type_t nptype = none;
		bool pset_unlocked = false;
		if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
			nptype = spill;
			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
		} else if (pset_needs_a_followup_IPI(pset)) {
			nptype = followup;
			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
		}

		assert(new_thread || !ast_processor);
		if (new_thread || next_rt_processor) {
			if (!pset_unlocked) {
				pset_unlock(pset);
				pset_unlocked = true;
			}
			/* Avoid sending the same CPU both a rebalance and an RT-preempt IPI. */
			if (ast_processor == next_rt_processor) {
				ast_processor = PROCESSOR_NULL;
				ipi_type = SCHED_IPI_NONE;
			}

			if (ast_processor) {
				sched_ipi_perform(ast_processor, ipi_type);
			}

			if (next_rt_processor) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
			}

			if (new_thread) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
				return new_thread;
			}
		}

		if (pset_unlocked) {
			pset_lock(pset);
		}

		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			/* Things changed while we dropped the lock */
			goto restart;
		}

		if (processor->is_recommended) {
			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
				/* Things changed while we dropped the lock */
				goto restart;
			}

			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
				/* secondary can only run realtime thread */
				if (idle_reason == 0) {
					idle_reason = 4;
				}
				goto idle;
			}
		} else if (!SCHED(processor_bound_count)(processor)) {
			/* processor not recommended and no bound threads */
			if (idle_reason == 0) {
				idle_reason = 5;
			}
			goto idle;
		}

		processor->deadline = RT_DEADLINE_NONE;

		/* No RT threads, so let's look at the regular threads. */
		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
			pset_commit_processor_to_new_thread(pset, processor, new_thread);

			clear_pending_AST_bits(pset, processor, 4);

			ast_processor = PROCESSOR_NULL;
			ipi_type = SCHED_IPI_NONE;

			/* Rebalance or wake the sibling depending on the new thread's SMT policy. */
			processor_t sprocessor = processor->processor_secondary;
			if (sprocessor != NULL) {
				if (sprocessor->state == PROCESSOR_RUNNING) {
					if (thread_no_smt(new_thread)) {
						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
						ast_processor = sprocessor;
					}
				} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
					ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
					ast_processor = sprocessor;
				}
			}
			pset_unlock(pset);

			if (ast_processor) {
				sched_ipi_perform(ast_processor, ipi_type);
			}
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
			return new_thread;
		}

		if (processor->must_idle) {
			processor->must_idle = false;
			*reason |= AST_REBALANCE;
			idle_reason = 6;
			goto idle;
		}

		if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
			/*
			 * No runnable threads, attempt to steal
			 * from other processors. Returns with pset lock dropped.
			 */

			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
				pset_lock(pset);
				pset_commit_processor_to_new_thread(pset, processor, new_thread);
				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					/*
					 * A realtime thread choose this processor while it was DISPATCHING
					 * and the pset lock was dropped
					 */
					ast_on(AST_URGENT | AST_PREEMPT);
				}

				clear_pending_AST_bits(pset, processor, 5);

				pset_unlock(pset);

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
				return new_thread;
			}

			/*
			 * If other threads have appeared, shortcut
			 * around again.
			 */
			if (SCHED(processor_bound_count)(processor)) {
				continue;
			}
			if (processor->is_recommended) {
				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
					continue;
				}
			}

			pset_lock(pset);
		}

idle:
		/* Someone selected this processor while we had dropped the lock */
		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
			goto restart;
		}

		if ((idle_reason == 0) && current_thread_can_keep_running) {
			/* This thread is the only runnable (non-idle) thread */
			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
				processor->deadline = thread->realtime.deadline;
			} else {
				processor->deadline = RT_DEADLINE_NONE;
			}

			sched_update_pset_load_average(pset, 0);

			clear_pending_AST_bits(pset, processor, 6);

			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
			pset_unlock(pset);
			return thread;
		}

		/*
		 * Nothing is runnable, or this processor must be forced idle,
		 * so set this processor idle if it was running.
		 */
		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
			processor_state_update_idle(processor);
		}
		pset_update_rt_stealable_state(pset);

		clear_pending_AST_bits(pset, processor, 7);

		/* Invoked with pset locked, returns with pset unlocked */
		SCHED(processor_balance)(processor, pset);

		/* Fall back to the idle thread; the outer loop exits once new_thread is set. */
		new_thread = processor->idle_thread;
	} while (new_thread == THREAD_NULL);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
	return new_thread;
}
2879
2880 /*
2881 * thread_invoke
2882 *
2883 * Called at splsched with neither thread locked.
2884 *
2885 * Perform a context switch and start executing the new thread.
2886 *
2887 * Returns FALSE when the context switch didn't happen.
2888 * The reference to the new thread is still consumed.
2889 *
2890 * "self" is what is currently running on the processor,
2891 * "thread" is the new thread to context switch to
2892 * (which may be the same thread in some cases)
2893 */
static boolean_t
thread_invoke(
	thread_t self,
	thread_t thread,
	ast_t reason)
{
	/*
	 * Context switching with a non-zero preemption level means we are
	 * either blocking while holding a spinlock / in interrupt context
	 * (level > 0) or have unbalanced an unlock (level < 0); both are fatal.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Continuation (if any) that 'self' parked itself in before blocking. */
	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	/* Take one timestamp up front; all switch accounting below uses it. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip periodic timeshare maintenance when switching to the idle
	 * thread or when doing a realtime direct handoff (latency-sensitive).
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime);
	}
#endif

	recount_log_switch_thread(&snap);

	assert_thread_magic(self);
	assert(self == current_thread());
	assert(self->runq == PROCESSOR_NULL);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	/* 'thread' stays locked until we commit to a particular switch path. */
	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	assert(thread->runq == PROCESSOR_NULL);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	if (continuation != NULL) {
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for the incoming thread. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required. However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			/* Resume the incoming thread in its continuation; never returns. */
			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/*
			 * No stack and no handoff possible: queue the thread for
			 * the stack-allocation thread and report switch failure.
			 * Caller still owns the reference to 'thread'.
			 */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with a stack: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	/* Track processor and pset migrations for the incoming thread. */
	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required. We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3213
3214 #if defined(CONFIG_SCHED_DEFERRED_AST)
3215 /*
3216 * pset_cancel_deferred_dispatch:
3217 *
3218 * Cancels all ASTs that we can cancel for the given processor set
3219 * if the current processor is running the last runnable thread in the
3220 * system.
3221 *
3222 * This function assumes the current thread is runnable. This must
3223 * be called with the pset unlocked.
3224 */
static void
pset_cancel_deferred_dispatch(
	processor_set_t pset,
	processor_t processor)
{
	processor_t active_processor = NULL;
	uint32_t sampled_sched_run_count;

	pset_lock(pset);
	/* Relaxed sample is fine: the comment below explains why a race is benign. */
	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * If we have emptied the run queue, and our current thread is runnable, we
	 * should tell any processors that are still DISPATCHING that they will
	 * probably not have any work to do. In the event that there are no
	 * pending signals that we can cancel, this is also uninteresting.
	 *
	 * In the unlikely event that another thread becomes runnable while we are
	 * doing this (sched_run_count is atomically updated, not guarded), the
	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
	 * in order to dispatch it to a processor in our pset. So, the other
	 * codepath will wait while we squash all cancelable ASTs, get the pset
	 * lock, and then dispatch the freshly runnable thread. So this should be
	 * correct (we won't accidentally have a runnable thread that hasn't been
	 * dispatched to an idle processor), if not ideal (we may be restarting the
	 * dispatch process, which could have some overhead).
	 */

	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * Candidates: CPUs that are DISPATCHING with a deferred AST
		 * pending and no urgent (non-cancelable) AST pending.
		 */
		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);
		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
			active_processor = processor_array[cpuid];
			/*
			 * If a processor is DISPATCHING, it could be because of
			 * a cancelable signal.
			 *
			 * IF the processor is not our
			 * current processor (the current processor should not
			 * be DISPATCHING, so this is a bit paranoid), AND there
			 * is a cancelable signal pending on the processor, AND
			 * there is no non-cancelable signal pending (as there is
			 * no point trying to backtrack on bringing the processor
			 * up if a signal we cannot cancel is outstanding), THEN
			 * it should make sense to roll back the processor state
			 * to the IDLE state.
			 *
			 * If the racey nature of this approach (as the signal
			 * will be arbitrated by hardware, and can fire as we
			 * roll back state) results in the core responding
			 * despite being pushed back to the IDLE state, it
			 * should be no different than if the core took some
			 * interrupt while IDLE.
			 */
			if (active_processor != processor) {
				/*
				 * Squash all of the processor state back to some
				 * reasonable facsimile of PROCESSOR_IDLE.
				 */

				processor_state_update_idle(active_processor);
				active_processor->deadline = RT_DEADLINE_NONE;
				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
				machine_signal_idle_cancel(active_processor);
			}
		}
	}

	pset_unlock(pset);
}
3297 #else
3298 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3299 #endif
3300
3301 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3302 thread_csw_callout(
3303 thread_t old,
3304 thread_t new,
3305 uint64_t timestamp)
3306 {
3307 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3308 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3309 machine_switch_perfcontrol_context(event, timestamp, 0,
3310 same_pri_latency, old, new);
3311 }
3312
3313
3314 /*
3315 * thread_dispatch:
3316 *
3317 * Handle threads at context switch. Re-dispatch other thread
3318 * if still running, otherwise update run state and perform
3319 * special actions. Update quantum for other thread and begin
3320 * the quantum for ourselves.
3321 *
3322 * "thread" is the old thread that we have switched away from.
3323 * "self" is the new current thread that we have context switched to
3324 *
3325 * Called at splsched.
3326 *
3327 */
void
thread_dispatch(
	thread_t thread,
	thread_t self)
{
	processor_t processor = self->last_processor;
	/* Set when the outgoing thread was the idle thread; drives kperf timer setup. */
	bool was_idle = false;

	assert(processor == current_processor());
	assert(self == current_thread_volatile());
	assert(thread != self);

	if (thread != THREAD_NULL) {
		/*
		 * Do the perfcontrol callout for context switch.
		 * The reason we do this here is:
		 * - thread_dispatch() is called from various places that are not
		 *   the direct context switch path for eg. processor shutdown etc.
		 *   So adding the callout here covers all those cases.
		 * - We want this callout as early as possible to be close
		 *   to the timestamp taken in thread_invoke()
		 * - We want to avoid holding the thread lock while doing the
		 *   callout
		 * - We do not want to callout if "thread" is NULL.
		 */
		thread_csw_callout(thread, self, processor->last_dispatch);

#if KASAN
		if (thread->continuation != NULL) {
			/*
			 * Thread has a continuation and the normal stack is going away.
			 * Unpoison the stack and mark all fakestack objects as unused.
			 */
#if KASAN_CLASSIC
			kasan_fakestack_drop(thread);
#endif /* KASAN_CLASSIC */
			if (thread->kernel_stack) {
				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
			}
		}


#if KASAN_CLASSIC
		/*
		 * Free all unused fakestack objects.
		 */
		kasan_fakestack_gc(thread);
#endif /* KASAN_CLASSIC */
#endif /* KASAN */

		/*
		 * If blocked at a continuation, discard
		 * the stack.
		 */
		if (thread->continuation != NULL && thread->kernel_stack != 0) {
			stack_free(thread);
		}

		if (thread->state & TH_IDLE) {
			/* Idle thread: no quantum or ledger accounting needed. */
			was_idle = true;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
			    (uintptr_t)thread_tid(thread), 0, thread->state,
			    sched_run_buckets[TH_BUCKET_RUN], 0);
		} else {
			int64_t consumed;
			/* Unused portion of the outgoing thread's quantum, in mach time. */
			int64_t remainder = 0;

			if (processor->quantum_end > processor->last_dispatch) {
				remainder = processor->quantum_end -
				    processor->last_dispatch;
			}

			consumed = thread->quantum_remaining - remainder;

			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the task and
				 * the individual thread.
				 */
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit_thread(thread, thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
				if (thread->t_bankledger) {
					ledger_credit_thread(thread, thread->t_bankledger,
					    bank_ledgers.cpu_time,
					    (consumed - thread->t_deduct_bank_ledger_time));
				}
				thread->t_deduct_bank_ledger_time = 0;
				if (consumed > 0) {
					/*
					 * This should never be negative, but in traces we are seeing some instances
					 * of consumed being negative.
					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
					 */
					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
				}
			}

			/* For the thread that we just context switched away from, figure
			 * out if we have expired the wq quantum and set the AST if we have
			 */
			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
				thread_evaluate_workqueue_quantum_expiry(thread);
			}

			/*
			 * Pairs with task_restartable_ranges_synchronize
			 */
			wake_lock(thread);
			thread_lock(thread);

			/*
			 * Same as ast_check(), in case we missed the IPI
			 */
			thread_reset_pcs_ack_IPI(thread);

			/*
			 * Apply a priority floor if the thread holds a kernel resource
			 * or explicitly requested it.
			 * Do this before checking starting_pri to avoid overpenalizing
			 * repeated rwlock blockers.
			 */
			if (__improbable(thread->rwlock_count != 0)) {
				lck_rw_set_promotion_locked(thread);
			}
			if (__improbable(thread->priority_floor_count != 0)) {
				thread_floor_boost_set_promotion_locked(thread);
			}

			boolean_t keep_quantum = processor->first_timeslice;

			/*
			 * Treat a thread which has dropped priority since it got on core
			 * as having expired its quantum.
			 */
			if (processor->starting_pri > thread->sched_pri) {
				keep_quantum = FALSE;
			}

			/* Compute remainder of current quantum. */
			if (keep_quantum &&
			    processor->quantum_end > processor->last_dispatch) {
				thread->quantum_remaining = (uint32_t)remainder;
			} else {
				thread->quantum_remaining = 0;
			}

			if (thread->sched_mode == TH_MODE_REALTIME) {
				/*
				 * Cancel the deadline if the thread has
				 * consumed the entire quantum.
				 */
				if (thread->quantum_remaining == 0) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
				}
			} else {
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
				/*
				 * For non-realtime threads treat a tiny
				 * remaining quantum as an expired quantum
				 * but include what's left next time.
				 */
				if (thread->quantum_remaining < min_std_quantum) {
					thread->reason |= AST_QUANTUM;
					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
				}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
			}

			/*
			 * If we are doing a direct handoff then
			 * take the remainder of the quantum.
			 */
			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
				self->quantum_remaining = thread->quantum_remaining;
				thread->reason |= AST_QUANTUM;
				thread->quantum_remaining = 0;
			} else {
#if defined(CONFIG_SCHED_MULTIQ)
				if (SCHED(sched_groups_enabled) &&
				    thread->sched_group == self->sched_group) {
					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
					    self->reason, (uintptr_t)thread_tid(thread),
					    self->quantum_remaining, thread->quantum_remaining, 0);

					self->quantum_remaining = thread->quantum_remaining;
					thread->quantum_remaining = 0;
					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
				}
#endif /* defined(CONFIG_SCHED_MULTIQ) */
			}

			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);

			if (!(thread->state & TH_WAIT)) {
				/*
				 * Still runnable.
				 */
				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;

				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);

				ast_t reason = thread->reason;
				sched_options_t options = SCHED_NONE;

				if (reason & AST_REBALANCE) {
					options |= SCHED_REBALANCE;
					if (reason & AST_QUANTUM) {
						/*
						 * Having gone to the trouble of forcing this thread off a less preferred core,
						 * we should force the preferable core to reschedule immediately to give this
						 * thread a chance to run instead of just sitting on the run queue where
						 * it may just be stolen back by the idle core we just forced it off.
						 * But only do this at the end of a quantum to prevent cascading effects.
						 */
						options |= SCHED_PREEMPT;
					}
				}

				if (reason & AST_QUANTUM) {
					options |= SCHED_TAILQ;
				} else if (reason & AST_PREEMPT) {
					options |= SCHED_HEADQ;
				} else {
					options |= (SCHED_PREEMPT | SCHED_TAILQ);
				}

				/* Requeue the still-runnable outgoing thread. */
				thread_setrun(thread, options);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
				    sched_run_buckets[TH_BUCKET_RUN], 0);

				/* Wake anyone waiting for this thread to go off core. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);
			} else {
				/*
				 * Waiting.
				 */
				boolean_t should_terminate = FALSE;
				uint32_t new_run_count;
				int thread_state = thread->state;

				/* Only the first call to thread_dispatch
				 * after explicit termination should add
				 * the thread to the termination queue
				 */
				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
					should_terminate = TRUE;
					thread_state |= TH_TERMINATE2;
				}

				timer_stop(&thread->runnable_timer, processor->last_dispatch);

				thread_state &= ~TH_RUN;
				thread->state = thread_state;

				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
				thread->chosen_processor = PROCESSOR_NULL;

				new_run_count = SCHED(run_count_decr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
					work_interval_auto_join_unwind(thread);
				}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_SFI
				if (thread->reason & AST_SFI) {
					thread->wait_sfi_begin_time = processor->last_dispatch;
				}
#endif
				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
				    new_run_count, 0);

				if (thread_state & TH_WAIT_REPORT) {
					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
				}

				/* Wake anyone waiting for this thread to go off core. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);

				if (should_terminate) {
					thread_terminate_enqueue(thread);
				}
			}
		}
		/*
		 * The thread could have been added to the termination queue, so it's
		 * unsafe to use after this point.
		 */
		thread = THREAD_NULL;
	}

	int urgency = THREAD_URGENCY_NONE;
	uint64_t latency = 0;

	/* Update (new) current thread and reprogram running timers */
	thread_lock(self);

	if (!(self->state & TH_IDLE)) {
		uint64_t arg1, arg2;

#if CONFIG_SCHED_SFI
		ast_t new_ast;

		new_ast = sfi_thread_needs_ast(self, NULL);

		if (new_ast != AST_NONE) {
			ast_on(new_ast);
		}
#endif

		if (processor->last_dispatch < self->last_made_runnable_time) {
			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
			    processor->last_dispatch, self->last_made_runnable_time);
		}

		assert(self->last_made_runnable_time <= self->last_basepri_change_time);

		latency = processor->last_dispatch - self->last_made_runnable_time;
		assert(latency >= self->same_pri_latency);

		urgency = thread_get_urgency(self, &arg1, &arg2);

		thread_tell_urgency(urgency, arg1, arg2, latency, self);

		/*
		 * Get a new quantum if none remaining.
		 */
		if (self->quantum_remaining == 0) {
			thread_quantum_init(self);
		}

		/*
		 * Set up quantum timer and timeslice.
		 */
		processor->quantum_end = processor->last_dispatch +
		    self->quantum_remaining;

		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
		    processor->quantum_end, processor->last_dispatch);
		if (was_idle) {
			/*
			 * kperf's running timer is active whenever the idle thread for a
			 * CPU is not running.
			 */
			kperf_running_setup(processor, processor->last_dispatch);
		}
		running_timers_activate(processor);
		processor->first_timeslice = TRUE;
	} else {
		/* Going idle: no quantum timer or urgency for the idle thread. */
		running_timers_deactivate(processor);
		processor->first_timeslice = FALSE;
		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	}

	assert(self->block_hint == kThreadWaitNone);
	self->computation_epoch = processor->last_dispatch;
	self->reason = AST_NONE;
	processor->starting_pri = self->sched_pri;

	thread_unlock(self);

	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	    processor->last_dispatch);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	/*
	 * TODO: Can we state that redispatching our old thread is also
	 * uninteresting?
	 */
	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
		pset_cancel_deferred_dispatch(processor->processor_set, processor);
	}
#endif
}
3732
3733 /*
3734 * thread_block_reason:
3735 *
3736 * Forces a reschedule, blocking the caller if a wait
3737 * has been asserted.
3738 *
3739 * If a continuation is specified, then thread_invoke will
3740 * attempt to discard the thread's kernel stack. When the
3741 * thread resumes, it will execute the continuation function
3742 * on a new kernel stack.
3743 */
wait_result_t
thread_block_reason(
	thread_continue_t continuation,
	void *parameter,
	ast_t reason)
{
	thread_t self = current_thread();
	processor_t processor;
	thread_t new_thread;
	spl_t s;

	/* Interrupts stay disabled across the select/invoke loop. */
	s = splsched();

	processor = current_processor();

	/* If we're explicitly yielding, force a subsequent quantum */
	if (reason & AST_YIELD) {
		processor->first_timeslice = FALSE;
	}

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

#if PROC_REF_DEBUG
	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
	}
#endif

	/* Park the continuation so the stack may be discarded on switch. */
	self->continuation = continuation;
	self->parameter = parameter;

	/* Trace only when some state bit beyond TH_RUN/TH_IDLE is set. */
	if (self->state & ~(TH_RUN | TH_IDLE)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	}

	/*
	 * Keep selecting until a switch succeeds; thread_invoke can fail
	 * (e.g. the chosen thread has no kernel stack available).
	 */
	do {
		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	} while (!thread_invoke(self, new_thread, reason));

	splx(s);

	return self->wait_result;
}
3792
3793 /*
3794 * thread_block:
3795 *
3796 * Block the current thread if a wait has been asserted.
3797 */
3798 wait_result_t
thread_block(thread_continue_t continuation)3799 thread_block(
3800 thread_continue_t continuation)
3801 {
3802 return thread_block_reason(continuation, NULL, AST_NONE);
3803 }
3804
3805 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3806 thread_block_parameter(
3807 thread_continue_t continuation,
3808 void *parameter)
3809 {
3810 return thread_block_reason(continuation, parameter, AST_NONE);
3811 }
3812
3813 /*
3814 * thread_run:
3815 *
3816 * Switch directly from the current thread to the
3817 * new thread, handing off our quantum if appropriate.
3818 *
3819 * New thread must be runnable, and not on a run queue.
3820 *
3821 * Called at splsched.
3822 */
int
thread_run(
	thread_t self,
	thread_continue_t continuation,
	void *parameter,
	thread_t new_thread)
{
	ast_t reason = AST_NONE;

	/* A non-idle thread switching away directly hands off its quantum. */
	if ((self->state & TH_IDLE) == 0) {
		reason = AST_HANDOFF;
	}

	/*
	 * If this thread hadn't been setrun'ed, it
	 * might not have a chosen processor, so give it one
	 */
	if (new_thread->chosen_processor == NULL) {
		new_thread->chosen_processor = current_processor();
	}

	/* Stash the resume point; thread_continue() will pick these up. */
	self->continuation = continuation;
	self->parameter = parameter;

	while (!thread_invoke(self, new_thread, reason)) {
		/* the handoff failed, so we have to fall back to the normal block path */
		processor_t processor = current_processor();

		/* No handoff on the retry path: let thread_select() decide. */
		reason = AST_NONE;

		thread_lock(self);
		new_thread = thread_select(self, processor, &reason);
		thread_unlock(self);
	}

	return self->wait_result;
}
3860
3861 /*
3862 * thread_continue:
3863 *
3864 * Called at splsched when a thread first receives
3865 * a new stack after a continuation.
3866 *
3867 * Called with THREAD_NULL as the old thread when
3868 * invoked by machine_load_context.
3869 */
void
thread_continue(
	thread_t thread)
{
	thread_t self = current_thread();
	thread_continue_t continuation;
	void *parameter;

	DTRACE_SCHED(on__cpu);

	/* Saved by thread_block_reason()/thread_run() before we lost the stack. */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif

	/* Finish bookkeeping for the thread we switched from ('thread'). */
	thread_dispatch(thread, self);

	/* Consume the continuation so a later block starts clean. */
	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if CONFIG_KERNEL_TBI && KASAN
	/* Fresh stack: clear any stale KASan poison before running on it. */
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* CONFIG_KERNEL_TBI && KASAN */


	/* Tail-call into the continuation; it must not return. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
3915
3916 void
thread_quantum_init(thread_t thread)3917 thread_quantum_init(thread_t thread)
3918 {
3919 if (thread->sched_mode == TH_MODE_REALTIME) {
3920 thread->quantum_remaining = thread->realtime.computation;
3921 } else {
3922 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3923 }
3924 }
3925
3926 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)3927 sched_timeshare_initial_quantum_size(thread_t thread)
3928 {
3929 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
3930 return bg_quantum;
3931 } else {
3932 return std_quantum;
3933 }
3934 }
3935
3936 /*
3937 * run_queue_init:
3938 *
3939 * Initialize a run queue before first use.
3940 */
3941 void
run_queue_init(run_queue_t rq)3942 run_queue_init(
3943 run_queue_t rq)
3944 {
3945 rq->highq = NOPRI;
3946 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
3947 rq->bitmap[i] = 0;
3948 }
3949 rq->urgency = rq->count = 0;
3950 for (int i = 0; i < NRQS; i++) {
3951 circle_queue_init(&rq->queues[i]);
3952 }
3953 }
3954
3955 /*
3956 * run_queue_dequeue:
3957 *
3958 * Perform a dequeue operation on a run queue,
3959 * and return the resulting thread.
3960 *
3961 * The run queue must be locked (see thread_run_queue_remove()
3962 * for more info), and not empty.
3963 */
3964 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)3965 run_queue_dequeue(
3966 run_queue_t rq,
3967 sched_options_t options)
3968 {
3969 thread_t thread;
3970 circle_queue_t queue = &rq->queues[rq->highq];
3971
3972 if (options & SCHED_HEADQ) {
3973 thread = cqe_dequeue_head(queue, struct thread, runq_links);
3974 } else {
3975 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
3976 }
3977
3978 assert(thread != THREAD_NULL);
3979 assert_thread_magic(thread);
3980
3981 thread->runq = PROCESSOR_NULL;
3982 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3983 rq->count--;
3984 if (SCHED(priority_is_urgent)(rq->highq)) {
3985 rq->urgency--; assert(rq->urgency >= 0);
3986 }
3987 if (circle_queue_empty(queue)) {
3988 bitmap_clear(rq->bitmap, rq->highq);
3989 rq->highq = bitmap_first(rq->bitmap, NRQS);
3990 }
3991
3992 return thread;
3993 }
3994
3995 /*
3996 * run_queue_enqueue:
3997 *
3998 * Perform a enqueue operation on a run queue.
3999 *
4000 * The run queue must be locked (see thread_run_queue_remove()
4001 * for more info).
4002 */
4003 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4004 run_queue_enqueue(
4005 run_queue_t rq,
4006 thread_t thread,
4007 sched_options_t options)
4008 {
4009 circle_queue_t queue = &rq->queues[thread->sched_pri];
4010 boolean_t result = FALSE;
4011
4012 assert_thread_magic(thread);
4013
4014 if (circle_queue_empty(queue)) {
4015 circle_enqueue_tail(queue, &thread->runq_links);
4016
4017 rq_bitmap_set(rq->bitmap, thread->sched_pri);
4018 if (thread->sched_pri > rq->highq) {
4019 rq->highq = thread->sched_pri;
4020 result = TRUE;
4021 }
4022 } else {
4023 if (options & SCHED_TAILQ) {
4024 circle_enqueue_tail(queue, &thread->runq_links);
4025 } else {
4026 circle_enqueue_head(queue, &thread->runq_links);
4027 }
4028 }
4029 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4030 rq->urgency++;
4031 }
4032 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4033 rq->count++;
4034
4035 return result;
4036 }
4037
4038 /*
4039 * run_queue_remove:
4040 *
4041 * Remove a specific thread from a runqueue.
4042 *
4043 * The run queue must be locked.
4044 */
4045 void
run_queue_remove(run_queue_t rq,thread_t thread)4046 run_queue_remove(
4047 run_queue_t rq,
4048 thread_t thread)
4049 {
4050 circle_queue_t queue = &rq->queues[thread->sched_pri];
4051
4052 assert(thread->runq != PROCESSOR_NULL);
4053 assert_thread_magic(thread);
4054
4055 circle_dequeue(queue, &thread->runq_links);
4056 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4057 rq->count--;
4058 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4059 rq->urgency--; assert(rq->urgency >= 0);
4060 }
4061
4062 if (circle_queue_empty(queue)) {
4063 /* update run queue status */
4064 bitmap_clear(rq->bitmap, thread->sched_pri);
4065 rq->highq = bitmap_first(rq->bitmap, NRQS);
4066 }
4067
4068 thread->runq = PROCESSOR_NULL;
4069 }
4070
4071 /*
4072 * run_queue_peek
4073 *
4074 * Peek at the runq and return the highest
4075 * priority thread from the runq.
4076 *
4077 * The run queue must be locked.
4078 */
4079 thread_t
run_queue_peek(run_queue_t rq)4080 run_queue_peek(
4081 run_queue_t rq)
4082 {
4083 if (rq->count > 0) {
4084 circle_queue_t queue = &rq->queues[rq->highq];
4085 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4086 assert_thread_magic(thread);
4087 return thread;
4088 } else {
4089 return THREAD_NULL;
4090 }
4091 }
4092
/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into its per-priority deadline-ordered queue
 * and update the run queue's cached earliest-deadline state.
 * Returns true if the new thread should preempt (it became the earliest
 * deadline at its priority level).
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Per-priority bucket index within the RT run queue. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	bitmap_set(map, i);

	queue_t queue = &rt_runq->pri_queue;
	uint64_t deadline = thread->realtime.deadline;
	bool preempt = false;
	bool earliest = false;

	if (queue_empty(queue)) {
		/* First thread at this priority: trivially the earliest. */
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				/* Insert before the first thread with a later deadline. */
				if (iter == queue_first(queue)) {
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				/* Latest deadline of all: append at the tail. */
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Propagate to the queue-wide earliest-deadline cache if we beat it. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	thread->runq = processor;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4153
/*
 * rt_runq_dequeue:
 *
 * Remove and return the next realtime thread to run: normally the head
 * of the highest occupied priority bucket, but when strict priority is
 * disabled the earliest-deadline thread may be chosen instead, provided
 * the higher-priority thread can still meet its constraint afterwards.
 * Also refreshes the run queue's cached earliest-deadline state.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			/*
			 * Prefer the earliest-deadline thread only if running both
			 * computations (plus slop) still fits the higher-priority
			 * thread's constraint.
			 */
			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Bucket still occupied: its new head defines its earliest deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Re-derive the queue-wide earliest deadline across all buckets. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	new_thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4217
4218 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4219 rt_runq_first(rt_queue_t rt_run_queue)
4220 {
4221 bitmap_t *map = rt_run_queue->bitmap;
4222 int i = bitmap_first(map, NRTQS);
4223 if (i < 0) {
4224 return THREAD_NULL;
4225 }
4226 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4227 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4228
4229 return next_rt;
4230 }
4231
/*
 * rt_runq_remove:
 *
 * Remove a specific thread from its realtime priority bucket and
 * refresh both the bucket's and the whole run queue's cached
 * earliest-deadline state.  Mirrors the bookkeeping of rt_runq_dequeue().
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Per-priority bucket index within the RT run queue. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* Bucket still occupied: its head defines its earliest deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Re-derive the queue-wide earliest deadline across all buckets. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4277
4278 rt_queue_t
sched_rtlocal_runq(processor_set_t pset)4279 sched_rtlocal_runq(processor_set_t pset)
4280 {
4281 return &pset->rt_runq;
4282 }
4283
4284 void
sched_rtlocal_init(processor_set_t pset)4285 sched_rtlocal_init(processor_set_t pset)
4286 {
4287 pset_rt_init(pset);
4288 }
4289
/*
 * sched_rtlocal_queue_shutdown:
 *
 * Drain this pset's realtime run queue when its last available processor
 * goes away, re-dispatching each drained thread so it lands elsewhere.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t thread;
	queue_head_t tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Collect everything onto a local queue while the pset is locked. */
	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	/*
	 * Re-dispatch with the pset lock dropped; thread_setrun() will pick
	 * a new home for each thread and takes its own locks.
	 */
	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4325
4326 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t thread;

	/* Walk every pset of every pset node, starting at node 0. */
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Visit each occupied realtime priority bucket. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				/* Track the oldest make-runnable time seen across all RT threads. */
				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4358
4359 int64_t
sched_rtlocal_runq_count_sum(void)4360 sched_rtlocal_runq_count_sum(void)
4361 {
4362 pset_node_t node = &pset_node0;
4363 processor_set_t pset = node->psets;
4364 int64_t count = 0;
4365
4366 do {
4367 while (pset != NULL) {
4368 count += pset->rt_runq.runq_stats.count_sum;
4369
4370 pset = pset->pset_list;
4371 }
4372 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4373
4374 return count;
4375 }
4376
4377 /*
4378 * Called with stealing_pset locked and
4379 * returns with stealing_pset locked
4380 * but the lock will have been dropped
4381 * if a thread is returned.
4382 */
thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	if (!sched_allow_rt_steal) {
		return THREAD_NULL;
	}
	/* Local copy of the node's pset map; clearing our own bit only
	 * excludes us from the candidate scan below. */
	pset_map_t pset_map = stealing_pset->node->pset_map;

	bit_clear(pset_map, stealing_pset->pset_id);

	processor_set_t pset = stealing_pset;

	processor_set_t target_pset;
	uint64_t target_deadline;

retry:
	target_pset = NULL;
	/* Only steal if the victim beats our own deadline by more than epsilon. */
	target_deadline = earliest_deadline - rt_deadline_epsilon;

	/* Unlocked scan for the pset with the earliest stealable deadline. */
	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		if (nset->stealable_rt_threads_earliest_deadline < target_deadline) {
			target_deadline = nset->stealable_rt_threads_earliest_deadline;
			target_pset = nset;
		}
	}

	if (target_pset != NULL) {
		/* Swap locks to the target and re-check under its lock. */
		pset = change_locked_pset(pset, target_pset);
		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
			pset_update_rt_stealable_state(pset);
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

			/* Re-acquire the stealing pset's lock before returning. */
			pset = change_locked_pset(pset, stealing_pset);
			return new_thread;
		}
		/* Raced: the candidate is gone.  Refresh our deadline and rescan. */
		pset = change_locked_pset(pset, stealing_pset);
		earliest_deadline = rt_runq_earliest_deadline(pset);
		goto retry;
	}

	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
4429
4430 /*
4431 * pset is locked
4432 */
thread_t
sched_rt_choose_thread(processor_set_t pset)
{
	processor_t processor = current_processor();

	if (SCHED(steal_thread_enabled)(pset)) {
		/*
		 * Keep trying to steal while a spill request is pending for this
		 * CPU; clear the spill bit each time around (KDBG args: cpu,
		 * remaining mask, 0, call-site tag).
		 */
		do {
			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (spill_pending) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
			}
			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
			if (new_thread != THREAD_NULL) {
				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
				}
				return new_thread;
			}
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}

	/* No steal happened; make sure our spill bit is not left set. */
	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
	}

	/* Fall back to our own pset's realtime run queue. */
	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}
4467
4468 /*
4469 * realtime_queue_insert:
4470 *
4471 * Enqueue a thread for realtime execution.
4472 */
4473 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4474 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4475 {
4476 pset_assert_locked(pset);
4477
4478 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4479 pset_update_rt_stealable_state(pset);
4480
4481 return preempt;
4482 }
4483
4484 /*
4485 * realtime_setrun:
4486 *
4487 * Dispatch a thread for realtime execution.
4488 *
4489 * Thread must be locked. Associated pset must
4490 * be locked, and is returned unlocked.
4491 */
static void
realtime_setrun(
	processor_t chosen_processor,
	thread_t thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	bool pset_is_locked = true;

	/* Tight-constraint threads get backup processors signalled as well. */
	int n_backup = 0;

	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/* CPUs already urgently signalled beyond the RT queue depth count as
	 * existing backups; don't signal more than needed. */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	/* Slot 0 is the chosen processor; slots 1..n_backup are backups. */
	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			/* Chosen processor: decide whether it must preempt. */
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				/* Same priority: preempt only on a clearly earlier deadline. */
				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* Idle self: dispatch in place, no IPI needed. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking up: just mark the urgent AST pending. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					if (processor == current_processor()) {
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slots: pick additional RT processors to signal. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			/* choose_next_rt_processor_for_IPI() may drop the pset lock. */
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/*
			 * NOTE(review): 'backup' is not declared anywhere in this
			 * function's visible scope — possibly a lost declaration or an
			 * extraction artifact; confirm against the upstream source.
			 */
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if defined(__x86_64__)
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Fire all the IPIs after the pset lock is dropped. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4631
4632
/*
 * sched_ipi_deferred_policy:
 *
 * Decide whether a deferred IPI can be used for this destination.
 * Panics on platforms without deferred-AST support.
 */
sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
    thread_t thread, __unused sched_ipi_event_t event)
{
#if defined(CONFIG_SCHED_DEFERRED_AST)
#if CONFIG_THREAD_GROUPS
	/* Some thread groups opt out of deferral entirely. */
	if (thread) {
		struct thread_group *tg = thread_group_get(thread);
		if (thread_group_uses_immediate_ipi(tg)) {
			return SCHED_IPI_IMMEDIATE;
		}
	}
#endif /* CONFIG_THREAD_GROUPS */
	/* Defer only if no deferred AST is already pending for this CPU. */
	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
		return SCHED_IPI_DEFERRED;
	}
#else /* CONFIG_SCHED_DEFERRED_AST */
	(void) thread;
	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
#endif /* CONFIG_SCHED_DEFERRED_AST */
	return SCHED_IPI_NONE;
}
4655
/*
 * sched_ipi_action:
 *
 * Choose an IPI for 'dst' (never the current processor), transition an
 * idle destination to DISPATCHING, and record the pending-AST bits the
 * chosen IPI implies.  The caller performs the IPI later via
 * sched_ipi_perform().
 */
sched_ipi_type_t
sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
{
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	assert(dst != NULL);

	processor_set_t pset = dst->processor_set;
	/* The current processor handles its own ASTs; no IPI needed. */
	if (current_processor() == dst) {
		return SCHED_IPI_NONE;
	}

	bool dst_idle = (dst->state == PROCESSOR_IDLE);
	if (dst_idle) {
		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
	}

	/* Ask the active scheduler policy which IPI fits this event. */
	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	switch (ipi_type) {
	case SCHED_IPI_NONE:
		return SCHED_IPI_NONE;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	case SCHED_IPI_DEFERRED:
		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
		break;
#endif /* CONFIG_SCHED_DEFERRED_AST */
	default:
		/* Immediate/idle IPIs imply pending urgent + preempt ASTs. */
		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
		}
		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
		break;
	}
	return ipi_type;
}
4691
/*
 * sched_ipi_policy:
 *
 * Default policy for mapping a scheduler event to an IPI type for
 * destination 'dst' ('dst_idle' tells whether it was idle).
 * Never returns SCHED_IPI_NONE for a recognized event.
 */
sched_ipi_type_t
sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	boolean_t deferred_ipi_supported = false;
	processor_set_t pset = dst->processor_set;

#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
	case SCHED_IPI_EVENT_SMT_REBAL:
	case SCHED_IPI_EVENT_REBALANCE:
	case SCHED_IPI_EVENT_BOUND_THR:
	case SCHED_IPI_EVENT_RT_PREEMPT:
		/*
		 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
		 * scenarios use immediate IPIs always.
		 */
		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/* In the preemption case, use immediate IPIs for RT threads */
		if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
			ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			break;
		}

		/*
		 * For Non-RT threads preemption,
		 * If the core is active, use immediate IPIs.
		 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
		 */
		if (deferred_ipi_supported && dst_idle) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
		break;
	default:
		panic("Unrecognized scheduler IPI event type %d", event);
	}
	assert(ipi_type != SCHED_IPI_NONE);
	return ipi_type;
}
4738
4739 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4740 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4741 {
4742 switch (ipi) {
4743 case SCHED_IPI_NONE:
4744 break;
4745 case SCHED_IPI_IDLE:
4746 machine_signal_idle(dst);
4747 break;
4748 case SCHED_IPI_IMMEDIATE:
4749 cause_ast_check(dst);
4750 break;
4751 case SCHED_IPI_DEFERRED:
4752 machine_signal_idle_deferred(dst);
4753 break;
4754 default:
4755 panic("Unrecognized scheduler IPI type: %d", ipi);
4756 }
4757 }
4758
4759 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4760
4761 boolean_t
priority_is_urgent(int priority)4762 priority_is_urgent(int priority)
4763 {
4764 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4765 }
4766
4767 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4768
4769 /*
4770 * processor_setrun:
4771 *
4772 * Dispatch a thread for execution on a
4773 * processor.
4774 *
4775 * Thread must be locked. Associated pset must
4776 * be locked, and is returned unlocked.
4777 */
static void
processor_setrun(
	processor_t                     processor,
	thread_t                        thread,
	integer_t                       options)
{
	processor_set_t pset = processor->processor_set;
	pset_assert_locked(pset);
	ast_t preempt;
	/* What (if anything) we must do to the target processor after enqueue. */
	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	thread->chosen_processor = processor;

	/*
	 * Set preemption mode.
	 *
	 * Urgent preemption when the new thread's priority is urgent and
	 * strictly beats what is running; eager-preempt processors are
	 * always preempted urgently.  A decayed timeshare thread (sched_pri
	 * below base_pri) normally does not preempt at all.
	 */
#if defined(CONFIG_SCHED_DEFERRED_AST)
	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
#endif
	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if (processor->current_is_eagerpreempt) {
		preempt = (AST_PREEMPT | AST_URGENT);
	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else {
		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	}

	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
		/*
		 * Having gone to the trouble of forcing this thread off a less preferred core,
		 * we should force the preferable core to reschedule immediately to give this
		 * thread a chance to run instead of just sitting on the run queue where
		 * it may just be stolen back by the idle core we just forced it off.
		 */
		preempt |= AST_PREEMPT;
	}

	/* Enqueue first; the IPI/AST below is what makes the target notice it. */
	SCHED(processor_enqueue)(processor, thread, options);
	sched_update_pset_load_average(pset, 0);

	if (preempt != AST_NONE) {
		if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			/*
			 * Already waking up: just mark the urgent-AST bit so the
			 * dispatch path re-checks, no separate interrupt needed.
			 */
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
			}
		} else if ((processor->state == PROCESSOR_RUNNING ||
		    processor->state == PROCESSOR_SHUTDOWN) &&
		    (thread->sched_pri >= processor->current_pri)) {
			ipi_action = eInterruptRunning;
		}
	} else {
		/*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling
		 */
		if (processor->state == PROCESSOR_SHUTDOWN &&
		    thread->sched_pri >= processor->current_pri) {
			ipi_action = eInterruptRunning;
		} else if (processor->state == PROCESSOR_IDLE) {
			ipi_action = eExitIdle;
		} else if (processor->state == PROCESSOR_DISPATCHING) {
			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
			}
		}
	}

	if (ipi_action != eDoNothing) {
		if (processor == current_processor()) {
			/*
			 * Self-dispatch: no IPI needed.  Re-run the preemption
			 * check locally and set/clear the pending-AST mask bits
			 * to match the result.
			 */
			if (ipi_action == eExitIdle) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
			}
			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}

			if ((preempt & AST_URGENT) == AST_URGENT) {
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
				}
			} else {
				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
				}
			}

			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			} else {
				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
			}
		} else {
			/* Remote processor: pick the cheapest sufficient IPI type. */
			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
			ipi_type = sched_ipi_action(processor, thread, event);
		}
	}
	/* Per the header comment, the pset is returned unlocked. */
	pset_unlock(pset);
	/* IPI is sent only after dropping the pset lock. */
	sched_ipi_perform(processor, ipi_type);
}
4890
4891 /*
4892 * choose_next_pset:
4893 *
4894 * Return the next sibling pset containing
4895 * available processors.
4896 *
4897 * Returns the original pset if none other is
4898 * suitable.
4899 */
4900 static processor_set_t
choose_next_pset(processor_set_t pset)4901 choose_next_pset(
4902 processor_set_t pset)
4903 {
4904 processor_set_t nset = pset;
4905
4906 do {
4907 nset = next_pset(nset);
4908 } while (nset->online_processor_count < 1 && nset != pset);
4909
4910 return nset;
4911 }
4912
4913 /*
4914 * choose_processor:
4915 *
4916 * Choose a processor for the thread, beginning at
4917 * the pset. Accepts an optional processor hint in
4918 * the pset.
4919 *
4920 * Returns a processor, possibly from a different pset.
4921 *
4922 * The thread must be locked. The pset must be locked,
4923 * and the resulting pset is locked on return.
4924 */
processor_t
choose_processor(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * Prefer the hinted processor, when appropriate.
	 */

	/* Fold last processor hint from secondary processor to its primary */
	if (processor != PROCESSOR_NULL) {
		processor = processor->processor_primary;
	}

	/*
	 * Only consult platform layer if pset is active, which
	 * it may not be in some cases when a multi-set system
	 * is going to sleep.
	 */
	if (pset->online_processor_count) {
		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
			processor_t mc_processor = machine_choose_processor(pset, processor);
			if (mc_processor != PROCESSOR_NULL) {
				processor = mc_processor->processor_primary;
			}
		}
	}

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_SHUTDOWN:
			case PROCESSOR_PENDING_OFFLINE:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					return processor;
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
						return processor;
					}
					processor = PROCESSOR_NULL;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 *
	 * A primary/secondary pair of SMT processors are
	 * "unpaired" if the primary is busy but its
	 * corresponding secondary is idle (so the physical
	 * core has full use of its resources).
	 */

	/* Running tallies of the best candidates seen so far (MAXPRI+1 / INT_MAX == "none yet"). */
	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_secondary_priority = MAXPRI + 1;
	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
		assert(thread->sched_pri < BASEPRI_RTQUEUES);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime path: search psets (primaries first, then
		 * secondaries) for a CPU that can take this RT thread, or
		 * failing that, a CPU running a further-deadline RT thread
		 * to preempt.
		 */
		pset_node_t node = pset->node;
		bool include_ast_urgent_pending_cpus = false;
		cpumap_t ast_urgent_pending;
try_again:
		/* Re-entered (via goto) at most once, with include_ast_urgent_pending_cpus set. */
		ast_urgent_pending = 0;
		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
		for (; consider_secondaries < 2; consider_secondaries++) {
			pset = change_locked_pset(pset, starting_pset);
			do {
				cpumap_t available_map = pset_available_cpumap(pset);
				if (available_map == 0) {
					goto no_available_cpus;
				}

				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
				if (processor) {
					return processor;
				}

				if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
					if (processor) {
						/*
						 * Instead of looping through all the psets to find the global
						 * furthest deadline processor, preempt the first candidate found.
						 * The preempted thread will then find any other available far deadline
						 * processors to preempt.
						 */
						return processor;
					}

					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;

					/* Track the pset with the shortest RT run queue as a fallback enqueue target. */
					if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
						assert(cpuid >= 0);
						lc_processor = processor_array[cpuid];
						lowest_count = rt_runq_count(pset);
					}
				}

no_available_cpus:
				nset = next_pset(pset);

				if (nset != starting_pset) {
					pset = change_locked_pset(pset, nset);
				}
			} while (nset != starting_pset);
		}

		/* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
			if (lc_processor) {
				pset_assert_locked(lc_processor->processor_set);
				return lc_processor;
			}
		} else {
			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
				include_ast_urgent_pending_cpus = true;
				goto try_again;
			}
		}

		processor = lc_processor;

		if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
			/* Check that chosen processor is still usable */
			cpumap_t available_map = pset_available_cpumap(pset);
			if (bit_test(available_map, processor->cpu_id)) {
				return processor;
			}

			/* processor is no longer usable */
			processor = PROCESSOR_NULL;
		}

		pset_assert_locked(pset);
		pset_unlock(pset);
		return PROCESSOR_NULL;
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */

		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		int cpuid = lsb_first(idle_primary_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/* Rotate so the scan starts just after the last chosen CPU (round-robin fairness). */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			/* & 63: undo the rotation modulo the 64-bit map width. */
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			processor_t primary = processor->processor_primary;
			if (primary != processor) {
				/* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
					if (cpri < lowest_secondary_priority) {
						lowest_secondary_priority = cpri;
						lp_paired_secondary_processor = processor;
					}
				}
			} else {
				if (cpri < lowest_priority) {
					lowest_priority = cpri;
					lp_processor = processor;
				}
			}

			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
		 * the idle primary would have short-circuited the loop above
		 */
		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    ~pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);

		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
			processor = processor_array[cpuid];

			processor_t cprimary = processor->processor_primary;

			integer_t primary_pri = cprimary->current_pri;

			/*
			 * TODO: This should also make the same decisions
			 * as secondary_can_run_realtime_thread
			 *
			 * TODO: Keep track of the pending preemption priority
			 * of the primary to make this more accurate.
			 */

			/* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
				continue;
			}

			/*
			 * Find the idle secondary processor with the lowest priority primary
			 *
			 * We will choose this processor as a fallback if we find no better
			 * primary to preempt.
			 */
			if (primary_pri < lowest_idle_secondary_priority) {
				lp_idle_secondary_processor = processor;
				lowest_idle_secondary_priority = primary_pri;
			}

			/* Find the lowest priority active primary with idle secondary */
			if (primary_pri < lowest_unpaired_primary_priority) {
				/* If the primary processor is offline or starting up, it's not a candidate for this path */
				if (cprimary->state != PROCESSOR_RUNNING &&
				    cprimary->state != PROCESSOR_DISPATCHING) {
					continue;
				}

				if (!cprimary->is_recommended) {
					continue;
				}

				/* if the primary is pending preemption, don't try to re-preempt it */
				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				lowest_unpaired_primary_priority = primary_pri;
				lp_unpaired_primary_processor = cprimary;
			}
		}

		/*
		 * We prefer preempting a primary processor over waking up its secondary.
		 * The secondary will then be woken up by the preempted thread.
		 */
		if (thread->sched_pri > lowest_unpaired_primary_priority) {
			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
			return lp_unpaired_primary_processor;
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/*
	 * Make sure that we pick a running processor,
	 * and that the correct processor set is locked.
	 * Since we may have unlocked the candidate processor's
	 * pset, it may have changed state.
	 *
	 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the least priority
	 * primary, or the least busy primary.
	 */

	/* lowest_priority is evaluated in the main loops above */
	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
		processor = lp_idle_secondary_processor;
	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
		processor = lp_paired_secondary_processor;
	} else if (lc_processor != PROCESSOR_NULL) {
		processor = lc_processor;
	} else {
		processor = PROCESSOR_NULL;
	}

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	/* NULL return: caller (thread_setrun) retries and ultimately falls back to master_processor. */
	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5367
5368 /*
5369 * Default implementation of SCHED(choose_node)()
5370 * for single node systems
5371 */
5372 pset_node_t
sched_choose_node(__unused thread_t thread)5373 sched_choose_node(__unused thread_t thread)
5374 {
5375 return &pset_node0;
5376 }
5377
5378 /*
5379 * choose_starting_pset:
5380 *
5381 * Choose a starting processor set for the thread.
5382 * May return a processor hint within the pset.
5383 *
5384 * Returns a starting processor set, to be used by
5385 * choose_processor.
5386 *
5387 * The thread must be locked. The resulting pset is unlocked on return,
5388 * and is chosen without taking any pset locks.
5389 */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 * Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 * No Affinity case:
		 *
		 * Utilize a per task hint to spread threads
		 * among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime: prefer a pset whose primaries are not already
		 * running RT threads; failing that (and if RT-on-SMT is
		 * allowed), any pset with a non-RT CPU.
		 */
		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				/* Rotate by 1 so pset0 is considered last; the +1 below undoes the rotation. */
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-realtime: steer toward a pset that has an idle CPU, if any. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* Only pass the last-processor hint out if it still belongs to the chosen pset. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
5508
5509 /*
5510 * thread_setrun:
5511 *
5512 * Dispatch thread for execution, onto an idle
5513 * processor or run queue, and signal a preemption
5514 * as appropriate.
5515 *
5516 * Thread must be locked.
5517 */
void
thread_setrun(
	thread_t                        thread,
	sched_options_t                 options)
{
	processor_t processor = PROCESSOR_NULL;
	processor_set_t pset;

	/* Thread must be runnable and not already on a run queue. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->runq == PROCESSOR_NULL);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 * Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary. A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			/* On success, returns with the chosen processor's pset locked. */
			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
			if (processor != PROCESSOR_NULL) {
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely,
		 * choose the master_processor, which is always
		 * safe to choose.
		 */
		if (processor == PROCESSOR_NULL) {
			/* Choose fallback processor */
			processor = master_processor;
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
		}
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 * Bound case:
		 *
		 * Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 * Dispatch the thread on the chosen processor.
	 * TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		SCHED(check_spill)(pset, thread);
	}
}
5620
5621 processor_set_t
task_choose_pset(task_t task)5622 task_choose_pset(
5623 task_t task)
5624 {
5625 processor_set_t pset = task->pset_hint;
5626
5627 if (pset != PROCESSOR_SET_NULL) {
5628 pset = choose_next_pset(pset);
5629 }
5630
5631 return pset;
5632 }
5633
5634 /*
5635 * Check for a preemption point in
5636 * the current context.
5637 *
5638 * Called at splsched with thread locked.
5639 */
ast_t
csw_check(
	thread_t                thread,
	processor_t             processor,
	ast_t                   check_reason)
{
	processor_set_t pset = processor->processor_set;

	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Refresh the processor's view of the running thread before deciding. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	return preempt;
}
5672
5673 /*
5674 * Check for preemption at splsched with
5675 * pset and thread locked
5676 */
5677 ast_t
csw_check_locked(thread_t thread,processor_t processor,processor_set_t pset,ast_t check_reason)5678 csw_check_locked(
5679 thread_t thread,
5680 processor_t processor,
5681 processor_set_t pset,
5682 ast_t check_reason)
5683 {
5684 /*
5685 * If the current thread is running on a processor that is no longer recommended,
5686 * urgently preempt it, at which point thread_select() should
5687 * try to idle the processor and re-dispatch the thread to a recommended processor.
5688 */
5689 if (!processor->is_recommended) {
5690 return check_reason | AST_PREEMPT | AST_URGENT;
5691 }
5692
5693 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5694 return check_reason | AST_PREEMPT | AST_URGENT;
5695 }
5696
5697 if (rt_runq_count(pset) > 0) {
5698 if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5699 return check_reason | AST_PREEMPT | AST_URGENT;
5700 } else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5701 return check_reason | AST_PREEMPT | AST_URGENT;
5702 } else {
5703 return check_reason | AST_PREEMPT;
5704 }
5705 }
5706
5707 ast_t result = SCHED(processor_csw_check)(processor);
5708 if (result != AST_NONE) {
5709 return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
5710 }
5711
5712 /*
5713 * Same for avoid-processor
5714 *
5715 * TODO: Should these set AST_REBALANCE?
5716 */
5717 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
5718 return check_reason | AST_PREEMPT;
5719 }
5720
5721 /*
5722 * Even though we could continue executing on this processor, a
5723 * secondary SMT core should try to shed load to another primary core.
5724 *
5725 * TODO: Should this do the same check that thread_select does? i.e.
5726 * if no bound threads target this processor, and idle primaries exist, preempt
5727 * The case of RT threads existing is already taken care of above
5728 */
5729
5730 if (processor->current_pri < BASEPRI_RTQUEUES &&
5731 processor->processor_primary != processor) {
5732 return check_reason | AST_PREEMPT;
5733 }
5734
5735 if (thread->state & TH_SUSP) {
5736 return check_reason | AST_PREEMPT;
5737 }
5738
5739 #if CONFIG_SCHED_SFI
5740 /*
5741 * Current thread may not need to be preempted, but maybe needs
5742 * an SFI wait?
5743 */
5744 result = sfi_thread_needs_ast(thread, NULL);
5745 if (result != AST_NONE) {
5746 return check_reason | result;
5747 }
5748 #endif
5749
5750 return AST_NONE;
5751 }
5752
5753 /*
5754 * Handle preemption IPI or IPI in response to setting an AST flag
5755 * Triggered by cause_ast_check
5756 * Called at splsched
5757 */
void
ast_check(processor_t processor)
{
	/*
	 * Re-evaluate preemption for the thread currently running on this
	 * processor, in response to a cross-CPU AST/IPI. Only RUNNING and
	 * SHUTDOWN processors have a meaningful active thread to check.
	 */
	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		return;
	}

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	/* Acknowledge a pending restartable-ranges PC-reset IPI, if any. */
	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	/* Ask the scheduler whether the running thread should be preempted now. */
	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}
}
5820
5821
5822 /*
5823 * set_sched_pri:
5824 *
5825 * Set the scheduled priority of the specified thread.
5826 *
5827 * This may cause the thread to change queues.
5828 *
5829 * Thread must be locked.
5830 */
void
set_sched_pri(
	thread_t thread,
	int16_t new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* A running thread is never enqueued on a run queue. */
		assert(thread->state & TH_RUN);
		assert(thread->runq == PROCESSOR_NULL);
	} else {
		/* Pull the thread off its runq so the priority change cannot
		 * race with a dispatch at the old priority. */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Re-enqueue at the new priority. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/* Runnable on another core: poke that core so it re-evaluates
		 * preemption against the new priority. */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
5942
5943 /*
5944 * thread_run_queue_remove_for_handoff
5945 *
5946 * Pull a thread or its (recursive) push target out of the runqueue
5947 * so that it is ready for thread_run()
5948 *
5949 * Called at splsched
5950 *
5951 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
5952 * This may be different than the thread that was passed in.
5953 */
thread_t
thread_run_queue_remove_for_handoff(thread_t thread)
{
	thread_t pulled_thread = THREAD_NULL;

	thread_lock(thread);

	/*
	 * Check that the thread is not bound to a different processor,
	 * NO_SMT flag is not set on the thread, cluster type of
	 * processor matches with thread if the thread is pinned to a
	 * particular cluster and that realtime is not involved.
	 *
	 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
	 */
	processor_t processor = current_processor();
	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
	    && (!thread_no_smt(thread))
	    && (processor->current_pri < BASEPRI_RTQUEUES)    /* no RT thread running here */
	    && (thread->sched_pri < BASEPRI_RTQUEUES)         /* target is not RT either */
#if __AMP__
	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
	    ) {
		if (thread_run_queue_remove(thread)) {
			pulled_thread = thread;
		}
	}

	thread_unlock(thread);

	return pulled_thread;
}
5988
5989 /*
5990 * thread_prepare_for_handoff
5991 *
5992 * Make the thread ready for handoff.
5993 * If the thread was runnable then pull it off the runq, if the thread could
5994 * not be pulled, return NULL.
5995 *
5996 * If the thread was woken up from wait for handoff, make sure it is not bound to
5997 * different processor.
5998 *
5999 * Called at splsched
6000 *
6001 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6002 * This may be different than the thread that was passed in.
6003 */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/* Thread was woken for handoff and is not on a run queue. */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
		    && (!thread_no_smt(thread))
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			/* Ineligible for handoff here; make it runnable normally. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Thread may still be enqueued; try to pull it off its runq. */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6038
6039 /*
6040 * thread_run_queue_remove:
6041 *
6042 * Remove a thread from its current run queue and
6043 * return TRUE if successful.
6044 *
6045 * Thread must be locked.
6046 *
6047 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6048 * run queues because the caller locked the thread. Otherwise
6049 * the thread is on a run queue, but could be chosen for dispatch
6050 * and removed by another processor under a different lock, which
6051 * will set thread->runq to PROCESSOR_NULL.
6052 *
6053 * Hence the thread select path must not rely on anything that could
6054 * be changed under the thread lock after calling this function,
6055 * most importantly thread->sched_pri.
6056 */
boolean_t
thread_run_queue_remove(
	thread_t thread)
{
	boolean_t removed = FALSE;
	processor_t processor = thread->runq;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		assert(thread->runq == PROCESSOR_NULL);
		return FALSE;
	}

	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		/* Non-realtime: delegate to the active scheduler's runq removal. */
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime: the thread lives on the pset-wide RT run queue. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/* Re-check under the pset lock: another CPU may have dequeued it. */
	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6107
6108 /*
6109 * Put the thread back where it goes after a thread_run_queue_remove
6110 *
6111 * Thread must have been removed under the same thread lock hold
6112 *
6113 * thread locked, at splsched
6114 */
void
thread_run_queue_reinsert(thread_t thread, sched_options_t options)
{
	/* Caller must have pulled the thread off its runq under this same
	 * thread lock hold, and the thread must still be runnable. */
	assert(thread->runq == PROCESSOR_NULL);
	assert(thread->state & (TH_RUN));

	thread_setrun(thread, options);
}
6123
6124 void
sys_override_cpu_throttle(boolean_t enable_override)6125 sys_override_cpu_throttle(boolean_t enable_override)
6126 {
6127 if (enable_override) {
6128 cpu_throttle_enabled = 0;
6129 } else {
6130 cpu_throttle_enabled = 1;
6131 }
6132 }
6133
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task, or the idle thread: no urgency to report. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime threads report their RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * LOWPRI urgency is reported for unsuppressed threads lacking
		 * a QoS; all other throttled-priority threads report
		 * BACKGROUND urgency.
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Output parameters are optional. */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6195
6196 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6197 thread_get_perfcontrol_class(thread_t thread)
6198 {
6199 /* Special case handling */
6200 if (thread->state & TH_IDLE) {
6201 return PERFCONTROL_CLASS_IDLE;
6202 }
6203
6204 if (thread->sched_mode == TH_MODE_REALTIME) {
6205 return PERFCONTROL_CLASS_REALTIME;
6206 }
6207
6208 /* perfcontrol_class based on base_pri */
6209 if (thread->base_pri <= MAXPRI_THROTTLE) {
6210 return PERFCONTROL_CLASS_BACKGROUND;
6211 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6212 return PERFCONTROL_CLASS_UTILITY;
6213 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6214 return PERFCONTROL_CLASS_NONUI;
6215 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6216 return PERFCONTROL_CLASS_UI;
6217 } else {
6218 if (get_threadtask(thread) == kernel_task) {
6219 /*
6220 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6221 * All other lower priority kernel threads should be treated
6222 * as regular threads for performance control purposes.
6223 */
6224 return PERFCONTROL_CLASS_KERNEL;
6225 }
6226 return PERFCONTROL_CLASS_ABOVEUI;
6227 }
6228 }
6229
6230 /*
6231 * This is the processor idle loop, which just looks for other threads
6232 * to execute. Processor idle threads invoke this without supplying a
6233 * current thread to idle without an asserted wait state.
6234 *
6235 * Returns a the next thread to execute if dispatched directly.
6236 */
6237
6238 #if 0
6239 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6240 #else
6241 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6242 #endif
6243
6244 #if (DEVELOPMENT || DEBUG)
6245 int sched_idle_delay_cpuid = -1;
6246 #endif
6247
thread_t
processor_idle(
	thread_t thread,
	processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Account the transition into idle for CPU usage reporting. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);
	cpu_quiescent_counter_leave(snap.rsn_time_mach);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Leave the idle loop if anything needs this CPU's attention. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (processor->is_recommended && (processor->processor_primary == processor)) {
			/* Recommended primary CPUs wake for pending realtime work. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Other CPUs only wake for threads bound to them. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Test hook: artificially delay idle exit on the selected CPU. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Re-establish visibility of wakeup-source state, as above. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time());
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	/* Account the transition back to running. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	cpu_quiescent_counter_join(snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6374
6375 /*
6376 * Each processor has a dedicated thread which
6377 * executes the idle loop when there is no suitable
6378 * previous context.
6379 *
6380 * This continuation is entered with interrupts disabled.
6381 */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	/* Entered with interrupts disabled (see continuation contract above). */
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/* Hand off directly to the selected thread, re-entering this
		 * continuation the next time the idle thread runs. */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing to run: block, resuming at this continuation when chosen. */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
6417
kern_return_t
idle_thread_create(
	processor_t processor)
{
	kern_return_t result;
	thread_t thread;
	spl_t s;
	char name[MAXTHREADNAMESIZE];

	/* Create the per-processor idle thread running the idle continuation. */
	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		return result;
	}

	/* Name it after its CPU for debugging/tooling visibility. */
	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	thread_set_thread_name(thread, name);

	s = splsched();
	thread_lock(thread);
	/* Bind to its processor and mark as the idle thread at idle priority. */
	thread->bound_processor = processor;
	processor->idle_thread = thread;
	thread->sched_pri = thread->base_pri = IDLEPRI;
	thread->state = (TH_RUN | TH_IDLE);
	thread->options |= TH_OPT_IDLE_THREAD;
	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
	thread_unlock(thread);
	splx(s);

	/* Drop the creation reference; presumably the processor's
	 * idle_thread linkage keeps the thread alive — see thread lifecycle. */
	thread_deallocate(thread);

	return KERN_SUCCESS;
}
6450
6451 static void sched_update_powered_cores_continue(void);
6452
6453 /*
6454 * sched_startup:
6455 *
6456 * Kicks off scheduler services.
6457 *
6458 * Called at splsched.
6459 */
void
sched_startup(void)
{
	kern_return_t result;
	thread_t thread;

	simple_lock_init(&sched_vm_group_list_lock, 0);

	/* Spawn the scheduler maintenance thread (runs sched_init_thread). */
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("sched_startup");
	}

	thread_deallocate(thread);

	assert_thread_magic(thread);

	/*
	 * Yield to the sched_init_thread once, to
	 * initialize our own thread after being switched
	 * back to.
	 *
	 * The current thread is the only other thread
	 * active at this point.
	 */
	thread_block(THREAD_CONTINUE_NULL);

	/* Spawn the thread that manages powered-core updates. */
	result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
	    NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("sched_startup");
	}

	thread_deallocate(thread);

	assert_thread_magic(thread);
}
6498
6499 #if __arm64__
6500 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6501 #endif /* __arm64__ */
6502
6503
6504 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6505
6506 static volatile uint64_t sched_maintenance_deadline;
6507 static uint64_t sched_tick_last_abstime;
6508 static uint64_t sched_tick_delta;
6509 uint64_t sched_tick_max_delta;
6510
6511
6512 /*
6513 * sched_init_thread:
6514 *
6515 * Perform periodic bookkeeping functions about ten
6516 * times per second.
6517 */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t sched_tick_ctime, late_time;

	/* Track the earliest make-runnable times seen during the runq scans,
	 * to report worst-case runnable latencies per band below. */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First invocation: establish the baseline timestamp. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 * Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 * Scan the run queues for threads which
	 * may need to be updated, and find the earliest runnable thread on the runqueue
	 * to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/* Convert earliest make-runnable times into elapsed latencies,
	 * clamped to 0 if the timestamp is in the future. */
	uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until sched_timeshare_consider_maintenance() wakes us,
	 * then re-enter this continuation from the top. */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6623
6624 static uint64_t sched_maintenance_wakeups;
6625
6626 /*
6627 * Determine if the set of routines formerly driven by a maintenance timer
6628 * must be invoked, based on a deadline comparison. Signals the scheduler
6629 * maintenance thread on deadline expiration. Must be invoked at an interval
6630 * lower than the "sched_tick_interval", currently accomplished by
6631 * invocation via the quantum expiration timer and at context switch time.
6632 * Performance matters: this routine reuses a timestamp approximating the
6633 * current absolute time received from the caller, and should perform
6634 * no more than a comparison against the deadline in the common case.
6635 */
void
sched_timeshare_consider_maintenance(uint64_t ctime)
{
	cpu_quiescent_counter_checkin(ctime);

	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread itself must not try to wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/* Only the CPU that wins the CAS performs the wakeup, so the
		 * maintenance thread is signalled at most once per interval. */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
		}
	}

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 claims the computation; the winner re-arms the deadline. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
6685
6686 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6687
6688 void
sched_init_thread(void)6689 sched_init_thread(void)
6690 {
6691 thread_block(THREAD_CONTINUE_NULL);
6692
6693 thread_t thread = current_thread();
6694
6695 thread_set_thread_name(thread, "sched_maintenance_thread");
6696
6697 sched_maintenance_thread = thread;
6698
6699 SCHED(maintenance_continuation)();
6700
6701 /*NOTREACHED*/
6702 }
6703
6704 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6705
6706 /*
6707 * thread_update_scan / runq_scan:
6708 *
6709 * Scan the run queues to account for timesharing threads
6710 * which need to be updated.
6711 *
6712 * Scanner runs in two passes. Pass one squirrels likely
6713 * threads away in an array, pass two does the update.
6714 *
6715 * This is necessary because the run queue is locked for
6716 * the candidate scan, but the thread is locked for the update.
6717 *
6718 * Array should be sized to make forward progress, without
6719 * disabling preemption for long periods.
6720 */
6721
6722 #define THREAD_UPDATE_SIZE 128
6723
6724 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
6725 static uint32_t thread_update_count = 0;
6726
6727 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
6728 boolean_t
thread_update_add_thread(thread_t thread)6729 thread_update_add_thread(thread_t thread)
6730 {
6731 if (thread_update_count == THREAD_UPDATE_SIZE) {
6732 return FALSE;
6733 }
6734
6735 thread_update_array[thread_update_count++] = thread;
6736 thread_reference(thread);
6737 return TRUE;
6738 }
6739
6740 void
thread_update_process_threads(void)6741 thread_update_process_threads(void)
6742 {
6743 assert(thread_update_count <= THREAD_UPDATE_SIZE);
6744
6745 for (uint32_t i = 0; i < thread_update_count; i++) {
6746 thread_t thread = thread_update_array[i];
6747 assert_thread_magic(thread);
6748 thread_update_array[i] = THREAD_NULL;
6749
6750 spl_t s = splsched();
6751 thread_lock(thread);
6752 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
6753 SCHED(update_priority)(thread);
6754 }
6755 thread_unlock(thread);
6756 splx(s);
6757
6758 thread_deallocate(thread);
6759 }
6760
6761 thread_update_count = 0;
6762 }
6763
6764 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)6765 runq_scan_thread(
6766 thread_t thread,
6767 sched_update_scan_context_t scan_context)
6768 {
6769 assert_thread_magic(thread);
6770
6771 if (thread->sched_stamp != sched_tick &&
6772 thread->sched_mode == TH_MODE_TIMESHARE) {
6773 if (thread_update_add_thread(thread) == FALSE) {
6774 return TRUE;
6775 }
6776 }
6777
6778 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
6779 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
6780 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
6781 }
6782 } else {
6783 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
6784 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
6785 }
6786 }
6787
6788 return FALSE;
6789 }
6790
6791 /*
6792 * Scan a runq for candidate threads.
6793 *
6794 * Returns TRUE if retry is needed.
6795 */
boolean_t
runq_scan(
	run_queue_t runq,
	sched_update_scan_context_t scan_context)
{
	int count = runq->count;
	int queue_index;

	assert(count >= 0);

	if (count == 0) {
		return FALSE;
	}

	/*
	 * Visit only the priority levels that actually have runnable
	 * threads, as recorded in the run queue's occupancy bitmap.
	 */
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
		thread_t thread;
		circle_queue_t queue = &runq->queues[queue_index];

		cqe_foreach_element(thread, queue, runq_links) {
			assert(count > 0);
			/*
			 * TRUE means thread_update_array filled up mid-scan;
			 * propagate so the caller processes it and retries.
			 */
			if (runq_scan_thread(thread, scan_context) == TRUE) {
				return TRUE;
			}
			count--;
		}
	}

	return FALSE;
}
6827
6828 #if CONFIG_SCHED_CLUTCH
6829
/*
 * Clutch-scheduler analogue of runq_scan(): walk a flat list of
 * timeshare threads, feeding each through runq_scan_thread().
 * Returns TRUE if the scan must be retried (stash array filled).
 */
boolean_t
sched_clutch_timeshare_scan(
	queue_t thread_queue,
	uint16_t thread_count,
	sched_update_scan_context_t scan_context)
{
	if (thread_count == 0) {
		return FALSE;
	}

	thread_t thread;
	/* _safe variant: the element may be unlinked elsewhere while we scan. */
	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
		if (runq_scan_thread(thread, scan_context) == TRUE) {
			return TRUE;
		}
		thread_count--;
	}

	/* Caller-supplied count should exactly cover the queue. */
	assert(thread_count == 0);
	return FALSE;
}
6851
6852
6853 #endif /* CONFIG_SCHED_CLUTCH */
6854
6855 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6856
6857 bool
thread_is_eager_preempt(thread_t thread)6858 thread_is_eager_preempt(thread_t thread)
6859 {
6860 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
6861 }
6862
/*
 * Mark a thread as eagerly preemptible and make the flag take effect
 * promptly: re-check preemption if it is the current thread, or poke
 * its last CPU if it appears to be running elsewhere.
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	/* Callers pair set/clear exactly once; flag must not already be set. */
	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		thread_unlock(thread);

		if (ast != AST_NONE) {
			/* Preemption is wanted now; block so the AST is acted on. */
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/*
		 * If the target looks on-core elsewhere, interrupt that CPU so
		 * it re-evaluates preemption with the new flag.  These checks
		 * are advisory; the flag itself is protected by the thread lock.
		 */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
6896
/*
 * Clear the eager-preemption flag set by thread_set_eager_preempt(),
 * keeping the current processor's cached copy in sync.
 */
void
thread_clear_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	/* Must currently be set; see thread_set_eager_preempt(). */
	assert(thread_is_eager_preempt(thread));

	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* Mirror the flag change into the per-processor cache. */
		current_processor()->current_is_eagerpreempt = false;
	}

	thread_unlock(thread);
	splx(s);
}
6914
6915 /*
6916 * Scheduling statistics
6917 */
6918 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)6919 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
6920 {
6921 struct sched_statistics *stats;
6922 boolean_t to_realtime = FALSE;
6923
6924 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
6925 stats->csw_count++;
6926
6927 if (otherpri >= BASEPRI_REALTIME) {
6928 stats->rt_sched_count++;
6929 to_realtime = TRUE;
6930 }
6931
6932 if ((reasons & AST_PREEMPT) != 0) {
6933 stats->preempt_count++;
6934
6935 if (selfpri >= BASEPRI_REALTIME) {
6936 stats->preempted_rt_count++;
6937 }
6938
6939 if (to_realtime) {
6940 stats->preempted_by_rt_count++;
6941 }
6942 }
6943 }
6944
6945 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)6946 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
6947 {
6948 uint64_t timestamp = mach_absolute_time();
6949
6950 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
6951 stats->last_change_timestamp = timestamp;
6952 }
6953
6954 /*
6955 * For calls from assembly code
6956 */
6957 #undef thread_wakeup
6958 void
6959 thread_wakeup(
6960 event_t x);
6961
/* Out-of-line wrapper (macro is #undef'd above) so assembly can call it. */
void
thread_wakeup(
	event_t x)
{
	thread_wakeup_with_result(x, THREAD_AWAKENED);
}
6968
6969 boolean_t
preemption_enabled(void)6970 preemption_enabled(void)
6971 {
6972 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
6973 }
6974
/* Convert the default timer-deadline tracking-bin bounds from ns to mach ticks. */
static void
sched_timer_deadline_tracking_init(void)
{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
6981
/* Most recent powered-cores mask requested (consumed asynchronously by the
 * sched_update_powered_cores_continue thread); protected by
 * sched_available_cores_lock. */
static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
processor_reason_t latest_requested_reason = REASON_NONE;
/* Mask most recently handed to sched_update_powered_cores(). */
static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
/* Set across system sleep; see sched_override_available_cores_for_sleep(). */
bool perfcontrol_sleep_override = false;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
/* Nested suspend count; while > 0 cluster powerdown requests are not applied. */
int32_t cluster_powerdown_suspend_count = 0;
6990
/*
 * Lock-free snapshot of the sleep-override flag.
 * NOTE(review): the acquire fence presumably orders this read against
 * prior updates made under sched_available_cores_lock — confirm the
 * intended pairing before relying on it.
 */
bool
sched_is_in_sleep(void)
{
	os_atomic_thread_fence(acquire);
	return perfcontrol_sleep_override;
}
6997
/*
 * Kernel-thread continuation that applies powered-cores requests posted
 * by sched_perfcontrol_update_powered_cores().  It drains requests until
 * the latest and currently-applied masks converge, arms a wait, and
 * blocks on itself as the wakeup event.  Never returns.
 */
static void
sched_update_powered_cores_continue(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (!cluster_powerdown_suspend_count) {
		/* Snapshot the request state under the spinlock. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		uint64_t latest = latest_requested_powered_cores;
		processor_reason_t reason = latest_requested_reason;
		uint64_t current = current_requested_powered_cores;
		current_requested_powered_cores = latest;
		bool in_sleep = perfcontrol_sleep_override;

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		/* Re-apply until no new request arrived while we were working. */
		while (latest != current) {
			if (!in_sleep) {
				assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				/* Apply outside the spinlock; this can block. */
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
			}

			s = splsched();
			simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

			latest = latest_requested_powered_cores;
			reason = latest_requested_reason;
			current = current_requested_powered_cores;
			current_requested_powered_cores = latest;
			in_sleep = perfcontrol_sleep_override;

			simple_unlock(&sched_available_cores_lock);
			splx(s);
		}

		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);

		/*
		 * A request may have raced in after the loop exited but before
		 * the wait was armed; if so, cancel the wait so we run again.
		 */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
		}
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);

	/* Block with ourselves as the continuation; wakeups restart at the top. */
	thread_block((thread_continue_t)sched_update_powered_cores_continue);
	/*NOTREACHED*/
}
7051
/*
 * CLPC entry point: post a new powered-cores request and wake the
 * worker continuation to apply it.  Requests are dropped while cluster
 * powerdown is suspended.
 */
void
sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
{
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	/* Test-only mode: verify current state instead of posting a request. */
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(cluster_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(perfcontrol_sleep_override == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	bool should_wakeup = !cluster_powerdown_suspend_count;
	if (should_wakeup) {
		latest_requested_powered_cores = requested_powered_cores;
		latest_requested_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	if (should_wakeup) {
		/* Kick the worker; see sched_update_powered_cores_continue(). */
		thread_wakeup((event_t)sched_update_powered_cores_continue);
	}
}
7085
/*
 * Suspend cluster powerdown (nestable).  On the first suspend, reset the
 * request state to all-cores-powered and synchronously power everything
 * up, locking the state against further CLPC changes.
 */
void
suspend_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	assert(cluster_powerdown_suspend_count >= 0);

	bool first_suspend = (cluster_powerdown_suspend_count == 0);
	if (first_suspend) {
		/* Neutralize any pending request so the worker has nothing to do. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	cluster_powerdown_suspend_count++;

	if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7113
/*
 * Undo one suspend_cluster_powerdown().  On the last resume, reset the
 * request state and unlock it so CLPC requests are honored again.
 * Panics on unbalanced resume.
 */
void
resume_cluster_powerdown(void)
{
	lck_mtx_lock(&cluster_powerdown_lock);

	if (cluster_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
	}

	cluster_powerdown_suspend_count--;

	bool last_resume = (cluster_powerdown_suspend_count == 0);

	if (last_resume) {
		/* Reset request state to a known baseline before unlocking it. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7142
7143 LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
7144 static bool user_suspended_cluster_powerdown = false;
7145
7146 kern_return_t
suspend_cluster_powerdown_from_user(void)7147 suspend_cluster_powerdown_from_user(void)
7148 {
7149 kern_return_t ret = KERN_FAILURE;
7150
7151 lck_mtx_lock(&user_cluster_powerdown_lock);
7152
7153 if (!user_suspended_cluster_powerdown) {
7154 suspend_cluster_powerdown();
7155 user_suspended_cluster_powerdown = true;
7156 ret = KERN_SUCCESS;
7157 }
7158
7159 lck_mtx_unlock(&user_cluster_powerdown_lock);
7160
7161 return ret;
7162 }
7163
7164 kern_return_t
resume_cluster_powerdown_from_user(void)7165 resume_cluster_powerdown_from_user(void)
7166 {
7167 kern_return_t ret = KERN_FAILURE;
7168
7169 lck_mtx_lock(&user_cluster_powerdown_lock);
7170
7171 if (user_suspended_cluster_powerdown) {
7172 resume_cluster_powerdown();
7173 user_suspended_cluster_powerdown = false;
7174 ret = KERN_SUCCESS;
7175 }
7176
7177 lck_mtx_unlock(&user_cluster_powerdown_lock);
7178
7179 return ret;
7180 }
7181
7182 int
get_cluster_powerdown_user_suspended(void)7183 get_cluster_powerdown_user_suspended(void)
7184 {
7185 lck_mtx_lock(&user_cluster_powerdown_lock);
7186
7187 int ret = (int)user_suspended_cluster_powerdown;
7188
7189 lck_mtx_unlock(&user_cluster_powerdown_lock);
7190
7191 return ret;
7192 }
7193
#if DEVELOPMENT || DEBUG
/* Functions to support the temporary sysctl */
static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
/*
 * Sysctl setter: decode a packed request word and forward it to CLPC's
 * update path.  Encoding (from the checks below): bit 31 selects
 * REASON_CLPC_USER vs REASON_CLPC_SYSTEM, bits 29:28 carry flags, and
 * bits 28:0 are the core mask.
 */
void
sched_set_powered_cores(int requested_powered_cores)
{
	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
	uint32_t flags = requested_powered_cores & 0x30000000;

	/* Remember the raw word so the sysctl getter can echo it back. */
	saved_requested_powered_cores = requested_powered_cores;

	requested_powered_cores = bits(requested_powered_cores, 28, 0);

	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
}
/* Sysctl getter: last raw value passed to sched_set_powered_cores(). */
int
sched_get_powered_cores(void)
{
	return (int)saved_requested_powered_cores;
}
#endif
7215
7216 /*
7217 * Ensure that all cores are powered and recommended before sleep
7218 */
/*
 * Ensure that all cores are powered and recommended before sleep
 */
void
sched_override_available_cores_for_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == false) {
		perfcontrol_sleep_override = true;
#if __arm__ || __arm64__
		/* Recommend everything; the previous masks are restored after sleep. */
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Also force all cores powered; paired with resume in the restore path. */
	suspend_cluster_powerdown();
}
7237
7238 /*
7239 * Restore the previously recommended cores, but leave all cores powered
7240 * after sleep
7241 */
/*
 * Restore the previously recommended cores, but leave all cores powered
 * after sleep
 */
void
sched_restore_available_cores_after_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == true) {
		perfcontrol_sleep_override = false;
#if __arm__ || __arm64__
		/* Reinstate the intersection of perfcontrol and user requests. */
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_NONE, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Balances the suspend taken in sched_override_available_cores_for_sleep(). */
	resume_cluster_powerdown();
}
7261
7262 #if __arm__ || __arm64__
7263
/* Count of cores in perfcontrol's current recommendation (popcount of mask). */
uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
/* True while the all-cores failsafe has overridden perfcontrol's mask. */
bool perfcontrol_failsafe_active = false;

uint64_t perfcontrol_failsafe_maintenance_runnable_time;
uint64_t perfcontrol_failsafe_activation_time;
uint64_t perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int perfcontrol_failsafe_pid;
uint64_t perfcontrol_failsafe_tid;
uint64_t perfcontrol_failsafe_thread_timer_at_start;
uint64_t perfcontrol_failsafe_thread_timer_last_seen;
uint64_t perfcontrol_failsafe_recommended_at_trigger;
7279
7280 /*
7281 * Perf controller calls here to update the recommended core bitmask.
7282 * If the failsafe is active, we don't immediately apply the new value.
7283 * Instead, we store the new request and use it after the failsafe deactivates.
7284 *
7285 * If the failsafe is not active, immediately apply the update.
7286 *
7287 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7288 * interrupts are enabled
7289 *
7290 * currently prototype is in osfmk/arm/machine_routines.h
7291 */
/*
 * See the block comment above: stores the per-reason request, recomputes
 * the combined perfcontrol mask, and applies it unless the failsafe or
 * sleep override is holding the effective mask fixed.
 */
void
sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Track system and user requests separately; both must recommend a core. */
	if (reason == REASON_CLPC_SYSTEM) {
		perfcontrol_system_requested_recommended_cores = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		perfcontrol_user_requested_recommended_cores = recommended_cores;
	}

	perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
	perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);

	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
	} else {
		/* Deferred: the stored request is applied when the override lifts. */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
7322
/* Legacy 32-bit entry point; forwards as a REASON_CLPC_USER request. */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
7328
7329 /*
7330 * Consider whether we need to activate the recommended cores failsafe
7331 *
7332 * Called from quantum timer interrupt context of a realtime thread
7333 * No scheduler locks are held, interrupts are disabled
7334 */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
		/* keep track of how long the responsible thread runs */
		uint64_t cur_th_time = recount_current_thread_time_mach();

		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		/* Re-check under the lock; only update if this is the blamed thread. */
		if (perfcontrol_failsafe_active == TRUE &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
		}

		simple_unlock(&sched_available_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
		return;
	}

	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (m_thread->runq == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation. We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Another CPU may have raced us here; only one activation proceeds. */
	if (perfcontrol_failsafe_active == TRUE) {
		simple_unlock(&sched_available_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);

	perfcontrol_failsafe_active = TRUE;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = recount_current_thread_time_mach();

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);

	simple_unlock(&sched_available_cores_lock);
}
7445
7446 /*
7447 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7448 *
7449 * Runs in the context of the maintenance thread, no locks held
7450 */
/*
 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
 *
 * Runs in the context of the maintenance thread, no locks held
 */
static void
sched_recommended_cores_maintenance(void)
{
	/* Common case - no failsafe, nothing to be done here */
	if (__probable(perfcontrol_failsafe_active == FALSE)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (perfcontrol_failsafe_active == FALSE) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */

	int pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	/* On-core time the blamed thread accumulated while the failsafe ran. */
	uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
	uint64_t rec_cores_after = perfcontrol_requested_recommended_cores;
	uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	perfcontrol_failsafe_active = FALSE;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);

	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
	    REASON_NONE, 0);

out:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Printing happens after dropping the lock: printf may block. */
	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%llx -> 0x%llx\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
7528
#endif /* __arm__ || __arm64__ */
7530
/*
 * User-control path to (de)recommend a single processor.  Updates the
 * usercontrol mask and applies the combined recommendation, unless an
 * override (failsafe / sleep) is in effect on arm64.
 * The master processor cannot be disabled.
 */
kern_return_t
sched_processor_enable(processor_t processor, boolean_t enable)
{
	assert(preemption_enabled());

	if (processor == master_processor) {
		/* The system can hang if this is allowed */
		return KERN_NOT_SUPPORTED;
	}

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (enable) {
		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
	} else {
		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
	}

#if __arm64__
	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
	} else {
		/* Deferred: the stored mask is applied when the override lifts. */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}
#else /* __arm64__ */
	sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
#endif /* ! __arm64__ */

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return KERN_SUCCESS;
}
7569
/*
 * Mark a processor online in sched_online_processors.
 * Caller holds sched_available_cores_lock (hence "_locked").
 */
void
sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
{
	/* Only the system itself may bring the master processor online. */
	assert((processor != master_processor) || (reason == REASON_SYSTEM));

	bit_set(sched_online_processors, processor->cpu_id);
}
7577
/*
 * Mark a processor offline in sched_online_processors.
 *
 * REASON_SYSTEM requests are unconditional.  Other reasons fail with
 * KERN_NOT_IN_SET if the CPU is already offline, or with
 * KERN_RESOURCE_SHORTAGE if it is the last available (online and
 * recommended) processor.
 */
kern_return_t
sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
{
	/* Only the system itself may take the master processor offline. */
	assert((processor != master_processor) || (reason == REASON_SYSTEM));
	kern_return_t ret = KERN_SUCCESS;

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (reason == REASON_SYSTEM) {
		bit_clear(sched_online_processors, processor->cpu_id);
		simple_unlock(&sched_available_cores_lock);
		splx(s);
		return ret;
	}

	/* CPUs that are online AND recommended by both perfcontrol and the user. */
	uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;

	if (!bit_test(sched_online_processors, processor->cpu_id)) {
		/* Processor is already offline */
		ret = KERN_NOT_IN_SET;
	} else if (available_cores == BIT(processor->cpu_id)) {
		/* Refusing: this is the last available processor. */
		ret = KERN_RESOURCE_SHORTAGE;
	} else {
		bit_clear(sched_online_processors, processor->cpu_id);
		ret = KERN_SUCCESS;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return ret;
}
7611
7612 /*
7613 * Apply a new recommended cores mask to the processors it affects
7614 * Runs after considering failsafes and such
7615 *
7616 * Iterate over processors and update their ->is_recommended field.
7617 * If a processor is running, we let it drain out at its next
7618 * quantum expiration or blocking point. If a processor is idle, there
7619 * may be more work for it to do, so IPI it.
7620 *
7621 * interrupts disabled, sched_available_cores_lock is held
7622 */
static void
sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
{
	uint64_t needs_exit_idle_mask = 0x0;

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm64__ */
	    0, 0, 0);
#endif /* ! __arm64__ */

	/* Never allow the recommended set to become empty among online CPUs. */
	if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Bits that differ from this pset's current recommendations... */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			/* ...restricted to CPUs that are becoming recommended. */
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				processor->last_recommend_reason = reason;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					if (processor != current_processor()) {
						/* Defer the wakeup until all pset locks are dropped (see end of function). */
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				/* A running CPU must be preempted so it drains at its next quantum/block point. */
				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type != SCHED_IPI_NONE) {
					if (processor == current_processor()) {
						ast_on(AST_PREEMPT);
					} else {
						sched_ipi_perform(processor, ipi_type);
					}
				}

				/* Re-take the pset lock dropped by processor_queue_shutdown(). */
				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
7746
/*
 * sched_update_powered_cores()
 *
 * Bring up every core newly present in requested_powered_cores and shut
 * down every core newly absent from it.  Start-ups are processed first,
 * then shutdowns, so capacity is added before it is removed.
 *
 * LOCK_STATE/UNLOCK_STATE are only legal for REASON_SYSTEM with all cores
 * powered (asserted below).  Called without pset locks held; they are
 * taken and dropped per pset only to snapshot the cpu state maps.
 */
static void
sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
{
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested_powered_cores, reason, flags, 0);

	assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);

	/*
	 * Loop through newly set requested_powered_cores and start them.
	 * Loop through newly cleared requested_powered_cores and shut them down.
	 */

	/* CLPC-initiated shutdowns are temporary; mark them so they can be undone. */
	if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
		flags |= SHUTDOWN_TEMPORARY;
	}

	/* First set powered cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Snapshot which CPUs are currently powered (idle/dispatching/running). */
			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_powered = requested_changes & requested_powered_cores;

			cpumap_t cpu_map = newly_powered;

			if (flags & (LOCK_STATE | UNLOCK_STATE)) {
				/*
				 * We need to change the lock state even if
				 * we don't need to change the actual state.
				 */
				cpu_map = pset_requested_powered_cores;
				/* But not the master_processor, which is always implicitly locked */
				bit_clear(cpu_map, master_processor->cpu_id);
			}

			if (cpu_map == 0) {
				/* Nothing to do */
				continue;
			}

			int last_start_cpu_id = bit_first(cpu_map);

			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
				processor_t processor = processor_array[cpu_id];

				/* Only the final start (highest cpu_id) is made synchronous when requested. */
				if ((flags & WAIT_FOR_LAST_START) && (cpu_id == last_start_cpu_id)) {
					processor_start_reason(processor, reason, flags | WAIT_FOR_START);
				} else {
					processor_start_reason(processor, reason, flags);
				}
			}
		}
	}

	/* Now shutdown not powered cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;

			if (newly_unpowered == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
				processor_t processor = processor_array[cpu_id];

				processor_exit_reason(processor, reason, flags);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
}
7839
7840 void
thread_set_options(uint32_t thopt)7841 thread_set_options(uint32_t thopt)
7842 {
7843 spl_t x;
7844 thread_t t = current_thread();
7845
7846 x = splsched();
7847 thread_lock(t);
7848
7849 t->options |= thopt;
7850
7851 thread_unlock(t);
7852 splx(x);
7853 }
7854
/*
 * thread_set_pending_block_hint()
 *
 * Record the block hint to attribute to the thread at its next blocking
 * point.  NOTE(review): no locking here; presumably callers operate on
 * the current thread or otherwise serialize access — confirm at call sites.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
7860
/*
 * qos_max_parallelism()
 *
 * Thin dispatch to the active scheduler's qos_max_parallelism()
 * implementation.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
7866
7867 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)7868 sched_qos_max_parallelism(__unused int qos, uint64_t options)
7869 {
7870 host_basic_info_data_t hinfo;
7871 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
7872
7873
7874 /*
7875 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
7876 * implement their own qos_max_parallelism() interfaces.
7877 */
7878 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
7879
7880 /* Query the machine layer for core information */
7881 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
7882 (host_info_t)&hinfo, &count);
7883 assert(kret == KERN_SUCCESS);
7884
7885 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
7886 return hinfo.logical_cpu;
7887 } else {
7888 return hinfo.physical_cpu;
7889 }
7890 }
7891
7892 int sched_allow_NO_SMT_threads = 1;
7893 bool
thread_no_smt(thread_t thread)7894 thread_no_smt(thread_t thread)
7895 {
7896 return sched_allow_NO_SMT_threads &&
7897 (thread->bound_processor == PROCESSOR_NULL) &&
7898 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
7899 }
7900
7901 bool
processor_active_thread_no_smt(processor_t processor)7902 processor_active_thread_no_smt(processor_t processor)
7903 {
7904 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
7905 }
7906
7907 #if __arm64__
7908
7909 /*
7910 * Set up or replace old timer with new timer
7911 *
7912 * Returns true if canceled old timer, false if it did not
7913 */
7914 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)7915 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
7916 {
7917 /*
7918 * Exchange deadline for new deadline, if old deadline was nonzero,
7919 * then I cancelled the callback, otherwise I didn't
7920 */
7921
7922 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
7923 relaxed) != 0;
7924 }
7925
7926 /*
7927 * Set global SFI window (in usec)
7928 */
7929 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)7930 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
7931 {
7932 kern_return_t ret = KERN_NOT_SUPPORTED;
7933 #if CONFIG_THREAD_GROUPS
7934 if (window_usecs == 0ULL) {
7935 ret = sfi_window_cancel();
7936 } else {
7937 ret = sfi_set_window(window_usecs);
7938 }
7939 #endif // CONFIG_THREAD_GROUPS
7940 return ret;
7941 }
7942
7943 /*
7944 * Set background and maintenance SFI class offtimes
7945 */
7946 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)7947 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
7948 {
7949 kern_return_t ret = KERN_NOT_SUPPORTED;
7950 #if CONFIG_THREAD_GROUPS
7951 if (offtime_usecs == 0ULL) {
7952 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
7953 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
7954 } else {
7955 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
7956 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
7957 }
7958 #endif // CONFIG_THREAD_GROUPS
7959 return ret;
7960 }
7961
7962 /*
7963 * Set utility SFI class offtime
7964 */
7965 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)7966 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
7967 {
7968 kern_return_t ret = KERN_NOT_SUPPORTED;
7969 #if CONFIG_THREAD_GROUPS
7970 if (offtime_usecs == 0ULL) {
7971 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
7972 } else {
7973 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
7974 }
7975 #endif // CONFIG_THREAD_GROUPS
7976 return ret;
7977 }
7978
7979 #endif /* __arm64__ */
7980
7981 #if CONFIG_SCHED_EDGE
7982
7983 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
7984
7985 /*
7986 * sched_edge_pset_running_higher_bucket()
7987 *
7988 * Routine to calculate cumulative running counts for each scheduling
7989 * bucket. This effectively lets the load calculation calculate if a
7990 * cluster is running any threads at a QoS lower than the thread being
7991 * migrated etc.
7992 */
7993
7994 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)7995 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
7996 {
7997 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
7998
7999 /* Edge Scheduler Optimization */
8000 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8001 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
8002 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8003 running_higher[bucket]++;
8004 }
8005 }
8006 }
8007
8008 /*
8009 * sched_update_pset_load_average()
8010 *
8011 * Updates the load average for each sched bucket for a cluster.
8012 * This routine must be called with the pset lock held.
8013 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* Another CPU published a newer update; keep its data. */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp so the 32.8 fixed-point math below cannot overflow. */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

#if CONFIG_SCHED_EDGE
	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		if (old_shared_load != new_shared_load) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}
#endif /* CONFIG_SCHED_EDGE */

	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		/* Per-CPU depth: runnable at-or-above this bucket + RT + running at-or-above. */
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Zero <-> non-zero transition: adopt the new sample immediately. */
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		if (load_average != old_load_average) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
8096
8097 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)8098 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8099 {
8100 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8101 uint64_t avg_thread_execution_time = 0;
8102
8103 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8104 old_execution_time_packed.pset_execution_time_packed,
8105 new_execution_time_packed.pset_execution_time_packed, relaxed, {
8106 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8107 int64_t delta_ticks = curtime - last_update;
8108 if (delta_ticks < 0) {
8109 /*
8110 * Its possible that another CPU came in and updated the pset_execution_time
8111 * before this CPU could do it. Since the average execution time is meant to
8112 * be an approximate measure per cluster, ignore the older update.
8113 */
8114 os_atomic_rmw_loop_give_up(return );
8115 }
8116 uint64_t delta_nsecs = 0;
8117 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8118
8119 uint64_t nanotime = 0;
8120 absolutetime_to_nanoseconds(execution_time, &nanotime);
8121 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8122
8123 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8124 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8125
8126 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8127 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8128 new_execution_time_packed.pset_execution_time_last_update = curtime;
8129 });
8130 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
8131 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8132 }
8133 }
8134
/*
 * sched_pset_cluster_shared_rsrc_load()
 *
 * Return the last published shared-resource load for this cluster
 * (the value stored by sched_update_pset_load_average()).
 */
uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
}
8140
8141 #else /* CONFIG_SCHED_EDGE */
8142
8143 void
sched_update_pset_load_average(processor_set_t pset,__unused uint64_t curtime)8144 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
8145 {
8146 int non_rt_load = pset->pset_runq.count;
8147 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
8148 int new_load_average = ((int)pset->load_average + load) >> 1;
8149
8150 pset->load_average = new_load_average;
8151 #if (DEVELOPMENT || DEBUG)
8152 #if __AMP__
8153 if (pset->pset_cluster_type == PSET_AMP_P) {
8154 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
8155 }
8156 #endif
8157 #endif
8158 }
8159
/* No-op: average execution time is only tracked by the Edge scheduler. */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
8164
8165 #endif /* CONFIG_SCHED_EDGE */
8166
8167 /* pset is locked */
8168 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)8169 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8170 {
8171 int cpuid = processor->cpu_id;
8172 #if defined(__x86_64__)
8173 if (sched_avoid_cpu0 && (cpuid == 0)) {
8174 return false;
8175 }
8176 #endif
8177
8178 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8179
8180 return bit_test(fasttrack_map, cpuid);
8181 }
8182
8183 /* pset is locked */
/*
 * Pick a processor in this pset for a realtime thread, preferring (in
 * order): an idle-for-RT primary, a secondary whose primary is running
 * RT, any secondary, and finally — when the pset still has more non-RT
 * CPUs than queued RT threads — any primary, to keep the thread enqueued
 * here.  Returns PROCESSOR_NULL if no suitable CPU exists.
 */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/* Candidates: available, no urgent AST pending, not already running RT. */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate so cpu0 is considered last among primaries. */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id. */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * I was hoping the compiler would optimize
	 * this away when avoid_cpu0 is const bool false
	 * but it still complains about the assignment
	 * in that case.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		/* Relax the cpu0/cpu1 avoidance and retry the whole search once. */
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
8311
8312 /*
8313 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
8314 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
8315 *
8316 * pset is locked.
8317 */
8318 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)8319 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
8320 {
8321 uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
8322 processor_t fd_processor = PROCESSOR_NULL;
8323 int lowest_priority = max_pri;
8324
8325 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
8326 if (skip_processor) {
8327 bit_clear(cpu_map, skip_processor->cpu_id);
8328 }
8329 if (skip_spills) {
8330 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8331 }
8332
8333 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8334 processor_t processor = processor_array[cpuid];
8335
8336 if (processor->current_pri > lowest_priority) {
8337 continue;
8338 }
8339
8340 if (processor->current_pri < lowest_priority) {
8341 lowest_priority = processor->current_pri;
8342 furthest_deadline = processor->deadline;
8343 fd_processor = processor;
8344 continue;
8345 }
8346
8347 if (processor->deadline > furthest_deadline) {
8348 furthest_deadline = processor->deadline;
8349 fd_processor = processor;
8350 }
8351 }
8352
8353 if (fd_processor) {
8354 return fd_processor;
8355 }
8356
8357 /*
8358 * There is a race condition possible when there are multiple processor sets.
8359 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
8360 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
8361 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
8362 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
8363 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
8364 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
8365 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
8366 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
8367 *
8368 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
8369 * on the run queue of that pset.
8370 */
8371 if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
8372 cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
8373 assert(skip_processor == PROCESSOR_NULL);
8374 assert(skip_spills == false);
8375
8376 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8377 processor_t processor = processor_array[cpuid];
8378
8379 if (processor->current_pri > lowest_priority) {
8380 continue;
8381 }
8382
8383 if (processor->current_pri < lowest_priority) {
8384 lowest_priority = processor->current_pri;
8385 furthest_deadline = processor->deadline;
8386 fd_processor = processor;
8387 continue;
8388 }
8389
8390 if (processor->deadline > furthest_deadline) {
8391 furthest_deadline = processor->deadline;
8392 fd_processor = processor;
8393 }
8394 }
8395 }
8396
8397 return fd_processor;
8398 }
8399
8400 /* pset is locked */
8401 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)8402 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
8403 {
8404 bool skip_spills = true;
8405 bool include_ast_urgent_pending_cpus = false;
8406
8407 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
8408 if (next_processor != PROCESSOR_NULL) {
8409 return next_processor;
8410 }
8411
8412 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
8413 return next_processor;
8414 }
8415
8416 #if defined(__x86_64__)
8417 /* pset is locked */
8418 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)8419 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
8420 {
8421 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8422 int nbackup_cpus = 0;
8423
8424 if (include_backups && rt_runq_is_low_latency(pset)) {
8425 nbackup_cpus = sched_rt_n_backup_processors;
8426 }
8427
8428 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8429 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8430 bit_clear(cpu_map, 0);
8431 }
8432 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8433 }
8434
8435 /* pset is locked */
8436 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)8437 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
8438 {
8439 int nbackup_cpus = 0;
8440
8441 if (include_backups && rt_runq_is_low_latency(pset)) {
8442 nbackup_cpus = sched_rt_n_backup_processors;
8443 }
8444
8445 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
8446 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8447 }
8448 #endif
8449
/*
 * sched_ok_to_run_realtime_thread()
 *
 * Policy check: may this processor run a realtime thread?  On non-x86
 * any recommended processor qualifies; on x86 the sched_avoid_cpu0 mode
 * and SMT policy restrict cpu0, cpu1 and secondary CPUs to last-resort
 * use.  `as_backup` widens the check to count backup CPUs.
 */
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	/* A pending RT spill targeted at this CPU always permits it. */
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* cpu0 only as last resort: all other primaries must be busy with RT. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* Mode 2 also shuns cpu1 (cpu0's SMT sibling): mask ~0x3. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/* cpu1 (sibling of cpu0) only when RT-on-SMT allowed and everyone else is busy. */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* Secondary (SMT) CPU: only when RT-on-SMT allowed and all primaries run RT. */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
8480
8481 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)8482 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
8483 {
8484 if (drop_lock) {
8485 pset_unlock(pset);
8486 }
8487 }
8488
8489 void
thread_set_no_smt(bool set)8490 thread_set_no_smt(bool set)
8491 {
8492 if (!system_is_SMT) {
8493 /* Not a machine that supports SMT */
8494 return;
8495 }
8496
8497 thread_t thread = current_thread();
8498
8499 spl_t s = splsched();
8500 thread_lock(thread);
8501 if (set) {
8502 thread->sched_flags |= TH_SFLAG_NO_SMT;
8503 }
8504 thread_unlock(thread);
8505 splx(s);
8506 }
8507
8508 bool
thread_get_no_smt(void)8509 thread_get_no_smt(void)
8510 {
8511 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8512 }
8513
8514 extern void task_set_no_smt(task_t);
8515 void
task_set_no_smt(task_t task)8516 task_set_no_smt(task_t task)
8517 {
8518 if (!system_is_SMT) {
8519 /* Not a machine that supports SMT */
8520 return;
8521 }
8522
8523 if (task == TASK_NULL) {
8524 task = current_task();
8525 }
8526
8527 task_lock(task);
8528 task->t_flags |= TF_NO_SMT;
8529 task_unlock(task);
8530 }
8531
8532 #if DEBUG || DEVELOPMENT
8533 extern void sysctl_task_set_no_smt(char no_smt);
8534 void
sysctl_task_set_no_smt(char no_smt)8535 sysctl_task_set_no_smt(char no_smt)
8536 {
8537 if (!system_is_SMT) {
8538 /* Not a machine that supports SMT */
8539 return;
8540 }
8541
8542 task_t task = current_task();
8543
8544 task_lock(task);
8545 if (no_smt == '1') {
8546 task->t_flags |= TF_NO_SMT;
8547 }
8548 task_unlock(task);
8549 }
8550
8551 extern char sysctl_task_get_no_smt(void);
8552 char
sysctl_task_get_no_smt(void)8553 sysctl_task_get_no_smt(void)
8554 {
8555 task_t task = current_task();
8556
8557 if (task->t_flags & TF_NO_SMT) {
8558 return '1';
8559 }
8560 return '0';
8561 }
#endif /* DEBUG || DEVELOPMENT */
8563
8564
8565 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8566 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8567 {
8568 #if __AMP__
8569 spl_t s = splsched();
8570 thread_lock(thread);
8571 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8572 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8573 if (soft_bound) {
8574 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8575 }
8576 switch (cluster_type) {
8577 case 'e':
8578 case 'E':
8579 if (pset0.pset_cluster_type == PSET_AMP_E) {
8580 thread->th_bound_cluster_id = pset0.pset_id;
8581 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8582 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8583 }
8584 break;
8585 case 'p':
8586 case 'P':
8587 if (pset0.pset_cluster_type == PSET_AMP_P) {
8588 thread->th_bound_cluster_id = pset0.pset_id;
8589 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8590 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8591 }
8592 break;
8593 default:
8594 break;
8595 }
8596 thread_unlock(thread);
8597 splx(s);
8598
8599 if (thread == current_thread()) {
8600 thread_block(THREAD_CONTINUE_NULL);
8601 }
8602 #else /* __AMP__ */
8603 (void)thread;
8604 (void)cluster_type;
8605 (void)soft_bound;
8606 #endif /* __AMP__ */
8607 }
8608
extern uint32_t thread_bound_cluster_id(thread_t thread);
/*
 * Return the cluster id the thread is bound to, or
 * THREAD_BOUND_CLUSTER_NONE when it is unbound.
 */
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}
8615
8616 __private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)8617 thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
8618 {
8619 #if __AMP__
8620
8621 processor_set_t pset = NULL;
8622 if (options & (THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY)) {
8623 /* Validate the inputs for the bind case */
8624 int max_clusters = ml_get_cluster_count();
8625 if (cluster_id >= max_clusters) {
8626 /* Invalid cluster id */
8627 return KERN_INVALID_ARGUMENT;
8628 }
8629 pset = pset_array[cluster_id];
8630 if (pset == NULL) {
8631 /* Cluster has not been initialized yet */
8632 return KERN_INVALID_ARGUMENT;
8633 }
8634 if (options & THREAD_BIND_ELIGIBLE_ONLY) {
8635 if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
8636 /* Thread is not recommended for the cluster type */
8637 return KERN_INVALID_POLICY;
8638 }
8639 }
8640 }
8641
8642 if (options & THREAD_UNBIND) {
8643 /* If the thread was actually not bound to some cluster, nothing to do here */
8644 if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
8645 return KERN_SUCCESS;
8646 }
8647 }
8648
8649 spl_t s = splsched();
8650 thread_lock(thread);
8651
8652 /* Unbind the thread from its previous bound state */
8653 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8654 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8655
8656 if (options & THREAD_UNBIND) {
8657 /* Nothing more to do here */
8658 goto thread_bind_cluster_complete;
8659 }
8660
8661 if (options & THREAD_BIND_SOFT) {
8662 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8663 }
8664 thread->th_bound_cluster_id = cluster_id;
8665
8666 thread_bind_cluster_complete:
8667 thread_unlock(thread);
8668 splx(s);
8669
8670 if (thread == current_thread()) {
8671 thread_block(THREAD_CONTINUE_NULL);
8672 }
8673 #else /* __AMP__ */
8674 (void)thread;
8675 (void)cluster_id;
8676 (void)options;
8677 #endif /* __AMP__ */
8678 return KERN_SUCCESS;
8679 }
8680
8681 #if DEVELOPMENT || DEBUG
8682 extern int32_t sysctl_get_bound_cpuid(void);
8683 int32_t
sysctl_get_bound_cpuid(void)8684 sysctl_get_bound_cpuid(void)
8685 {
8686 int32_t cpuid = -1;
8687 thread_t self = current_thread();
8688
8689 processor_t processor = self->bound_processor;
8690 if (processor == NULL) {
8691 cpuid = -1;
8692 } else {
8693 cpuid = processor->cpu_id;
8694 }
8695
8696 return cpuid;
8697 }
8698
8699 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
8700 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)8701 sysctl_thread_bind_cpuid(int32_t cpuid)
8702 {
8703 processor_t processor = PROCESSOR_NULL;
8704
8705 if (cpuid == -1) {
8706 goto unbind;
8707 }
8708
8709 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
8710 return KERN_INVALID_VALUE;
8711 }
8712
8713 processor = processor_array[cpuid];
8714 if (processor == PROCESSOR_NULL) {
8715 return KERN_INVALID_VALUE;
8716 }
8717
8718 #if __AMP__
8719
8720 thread_t thread = current_thread();
8721
8722 if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
8723 if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
8724 /* Cannot hard-bind an already hard-cluster-bound thread */
8725 return KERN_NOT_SUPPORTED;
8726 }
8727 }
8728
8729 #endif /* __AMP__ */
8730
8731 unbind:
8732 thread_bind(processor);
8733
8734 thread_block(THREAD_CONTINUE_NULL);
8735 return KERN_SUCCESS;
8736 }
8737
8738 extern char sysctl_get_task_cluster_type(void);
8739 char
sysctl_get_task_cluster_type(void)8740 sysctl_get_task_cluster_type(void)
8741 {
8742 task_t task = current_task();
8743 processor_set_t pset_hint = task->pset_hint;
8744
8745 if (!pset_hint) {
8746 return '0';
8747 }
8748
8749 #if __AMP__
8750 if (pset_hint->pset_cluster_type == PSET_AMP_E) {
8751 return 'E';
8752 } else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
8753 return 'P';
8754 }
8755 #endif
8756
8757 return '0';
8758 }
8759
#if __AMP__
/*
 * Walk the pset node list for the first node with cluster type 't' and
 * return one of its psets, preferring a pset that still has recommended
 * processors.  Note: once a matching node is found the search stops
 * there — if that node's pset map is empty this returns
 * PROCESSOR_SET_NULL rather than trying later nodes.
 */
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer one with recommended processors */
			if (pset->recommended_bitmask != 0) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last (NULL if the map was empty) */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
#endif
8785
8786 extern void sysctl_task_set_cluster_type(char cluster_type);
8787 void
sysctl_task_set_cluster_type(char cluster_type)8788 sysctl_task_set_cluster_type(char cluster_type)
8789 {
8790 task_t task = current_task();
8791 processor_set_t pset_hint = PROCESSOR_SET_NULL;
8792
8793 #if __AMP__
8794 switch (cluster_type) {
8795 case 'e':
8796 case 'E':
8797 pset_hint = find_pset_of_type(PSET_AMP_E);
8798 break;
8799 case 'p':
8800 case 'P':
8801 pset_hint = find_pset_of_type(PSET_AMP_P);
8802 break;
8803 default:
8804 break;
8805 }
8806
8807 if (pset_hint) {
8808 task_lock(task);
8809 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
8810 task->pset_hint = pset_hint;
8811 task_unlock(task);
8812
8813 thread_block(THREAD_CONTINUE_NULL);
8814 }
8815 #else
8816 (void)cluster_type;
8817 (void)task;
8818 (void)pset_hint;
8819 #endif
8820 }
8821
8822 /*
8823 * The quantum length used for Fixed and RT sched modes. In general the quantum
8824 * can vary - for example for background or QOS.
8825 */
8826 extern uint64_t sysctl_get_quantum_us(void);
8827 uint64_t
sysctl_get_quantum_us(void)8828 sysctl_get_quantum_us(void)
8829 {
8830 uint32_t quantum;
8831 uint64_t quantum_ns;
8832
8833 quantum = SCHED(initial_quantum_size)(THREAD_NULL);
8834 absolutetime_to_nanoseconds(quantum, &quantum_ns);
8835
8836 return quantum_ns / 1000;
8837 }
8838
8839 #endif /* DEVELOPMENT || DEBUG */
8840