1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #if MONOTONIC
94 #include <kern/monotonic.h>
95 #endif /* MONOTONIC */
96 #include <kern/processor.h>
97 #include <kern/queue.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109 #include <kern/cpu_quiesce.h>
110
111 #include <vm/pmap.h>
112 #include <vm/vm_kern.h>
113 #include <vm/vm_map.h>
114 #include <vm/vm_pageout.h>
115
116 #include <mach/sdt.h>
117 #include <mach/mach_host.h>
118 #include <mach/host_info.h>
119
120 #include <sys/kdebug.h>
121 #include <kperf/kperf.h>
122 #include <kern/kpc.h>
123 #include <san/kasan.h>
124 #include <kern/pms.h>
125 #include <kern/host.h>
126 #include <stdatomic.h>
127
/*
 * Pair of atomic AST generation counters kept per CPU.
 * NOTE(review): presumably ast_gen advances when an AST is requested and
 * ast_ack when it has been acknowledged -- the users of these fields are
 * not visible in this part of the file; confirm before relying on it.
 */
struct ast_gen_pair {
	os_atomic(ast_gen_t)    ast_gen;
	os_atomic(ast_gen_t)    ast_ack;
};

/* One generation pair per CPU; ast_gen_init() seeds both counters with 1. */
static struct ast_gen_pair PERCPU_DATA(ast_gen_pair);
/* Per-CPU scheduler statistics. */
struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;

/*
 * Compare two generation values by looking at the sign of their
 * difference (cast to long) instead of comparing the raw values.
 */
#define AST_GEN_CMP(a, op, b)   ((long)((a) - (b)) op 0)
138
139 __startup_func
140 static void
ast_gen_init(void)141 ast_gen_init(void)
142 {
143 percpu_foreach(pair, ast_gen_pair) {
144 os_atomic_init(&pair->ast_gen, 1);
145 os_atomic_init(&pair->ast_ack, 1);
146 }
147 }
148 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, ast_gen_init);
149
/*
 *	deadline_add:
 *
 *	Saturating 64-bit addition: returns d + e, or UINT64_MAX when
 *	the sum does not fit in a uint64_t.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	uint64_t total;

	if (os_add_overflow(d, e, &total)) {
		return UINT64_MAX;
	}
	return total;
}
156
157 int
rt_runq_count(processor_set_t pset)158 rt_runq_count(processor_set_t pset)
159 {
160 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
161 }
162
163 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)164 rt_runq_earliest_deadline(processor_set_t pset)
165 {
166 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
167 }
168
169 static int
rt_runq_priority(processor_set_t pset)170 rt_runq_priority(processor_set_t pset)
171 {
172 pset_assert_locked(pset);
173 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
174
175 bitmap_t *map = rt_run_queue->bitmap;
176 int i = bitmap_first(map, NRTQS);
177 assert(i < NRTQS);
178
179 if (i >= 0) {
180 return i + BASEPRI_RTQUEUES;
181 }
182
183 return i;
184 }
185
186 static thread_t rt_runq_first(rt_queue_t rt_runq);
187
188 #if DEBUG
/*
 *	check_rt_runq_consistency:
 *
 *	DEBUG-only invariant checker for a real-time run queue.  Walks
 *	every per-priority queue and asserts that the per-priority
 *	bookkeeping (count, bitmap bit, earliest deadline, constraint)
 *	matches the queue contents, then asserts that the queue-wide
 *	atomics (earliest_deadline, count, constraint, ed_index) match
 *	the values recomputed during the walk.
 *
 *	If thread is non-NULL, additionally asserts that it was found on
 *	one of the queues.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	/* Values recomputed from the per-priority queues during the walk */
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* Deadlines must be non-decreasing along the queue */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			/* The cached per-priority values must describe the head of the queue */
			if (iter == queue_first(queue)) {
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* An empty queue has its bit clear and sentinel values cached */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		/* Strict '<' keeps the lowest index among equal earliest deadlines */
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* Queue-wide atomics must agree with what was recomputed above */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
247 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
248 #else
249 #define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
250 #endif
251
252 uint32_t rt_constraint_threshold;
253
254 static bool
rt_runq_is_low_latency(processor_set_t pset)255 rt_runq_is_low_latency(processor_set_t pset)
256 {
257 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
258 }
259
/* Timeshare preemption rate; sched_timeshare_init() derives std_quantum_us from it. */
#define DEFAULT_PREEMPTION_RATE 100             /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

/* Background preemption rate; sched_timeshare_init() derives bg_quantum_us from it. */
#define DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

/* Number of standard quanta used to size max_unsafe_computation / sched_safe_duration. */
#define MAX_UNSAFE_QUANTA               800
TUNABLE(int, max_unsafe_quanta, "unsafe", MAX_UNSAFE_QUANTA);

/* Number of standard quanta used to size max_poll_computation. */
#define MAX_POLL_QUANTA                 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

/* max_poll_quanta * std_quantum, set by sched_timeshare_timebase_init() */
uint64_t max_poll_computation;

/* max_unsafe_quanta * std_quantum, and twice that value, respectively */
uint64_t max_unsafe_computation;
uint64_t sched_safe_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Quanta as produced by clock_interval_to_absolutetime_interval() in
 * sched_timeshare_timebase_init(): standard, smallest remaining (250 us)
 * and background.
 */
uint32_t std_quantum;
uint32_t min_std_quantum;
uint32_t bg_quantum;

/* The same standard/background quanta in microseconds, set by sched_timeshare_init() */
uint32_t std_quantum_us;
uint32_t bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/* All three are derived from std_quantum in sched_timeshare_timebase_init() */
uint32_t thread_depress_time;
uint32_t default_timeshare_computation;
uint32_t default_timeshare_constraint;

/* Largest (50 ms) and smallest (50 us) rt computation, set by sched_realtime_timebase_init() */
uint32_t max_rt_quantum;
uint32_t min_rt_quantum;

/* Epsilon for comparing RT deadlines, set by sched_set_rt_deadline_epsilon() */
uint32_t rt_deadline_epsilon;

/* Constraint threshold for sending backup IPIs (4 ms), set by sched_realtime_timebase_init() */
uint32_t rt_constraint_threshold;
/* Constraint limit for low latency RT threads, set by sched_set_rt_constraint_ll() */
uint32_t rt_constraint_ll;
302
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Scheduler tick counter (reset to 0 by sched_timeshare_init()) and tick interval */
unsigned                sched_tick;
uint32_t                sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t                sched_load_compute_interval_us = 15000;
uint64_t                sched_load_compute_interval_abs;
/* Initialized to sched_load_compute_interval_abs in sched_timeshare_timebase_init() */
static _Atomic uint64_t sched_load_compute_deadline;

/* Every entry starts out as INT8_MAX (see sched_timeshare_timebase_init()) */
uint32_t        sched_pri_shifts[TH_BUCKET_MAX];
/* Shift computed from the scheduler tick interval in sched_timeshare_timebase_init() */
uint32_t        sched_fixed_shift;

uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
/* May be overridden by boot-arg or device tree in sched_init() */
int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
	                                               * 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
	                                               *  <= 5ms */

/* Bin limits compared against a processor's timer_call_ttd in thread_unblock() */
uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

/* interrupts disabled lock to guard recommended cores state */
decl_simple_lock_data(static, sched_recommended_cores_lock);
static uint64_t    usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
static void sched_update_recommended_cores(uint64_t recommended_cores);

#if __arm__ || __arm64__
static void sched_recommended_cores_maintenance(void);
/* Set to twice the scheduler tick interval in sched_timeshare_timebase_init() */
uint64_t    perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm__ || __arm64__ */

/* One second as converted by clock_interval_to_absolutetime_interval(); set in sched_timebase_init() */
uint64_t        sched_one_second_interval;
/* Consulted by thread_allowed_for_handoff(); overridable with the "direct_handoff" boot-arg */
boolean_t       allow_direct_handoff = TRUE;
348
349 /* Forwards */
350
351 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
352
353 static void load_shift_init(void);
354 static void preempt_pri_init(void);
355
356 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
357
358 thread_t processor_idle(
359 thread_t thread,
360 processor_t processor);
361
362 static ast_t
363 csw_check_locked(
364 thread_t thread,
365 processor_t processor,
366 processor_set_t pset,
367 ast_t check_reason);
368
369 static void processor_setrun(
370 processor_t processor,
371 thread_t thread,
372 integer_t options);
373
374 static void
375 sched_realtime_timebase_init(void);
376
377 static void
378 sched_timer_deadline_tracking_init(void);
379
#if DEBUG
extern int debug_task;
/*
 * TLOG: kprintf the formatted message when any bit of mask `a' is set in
 * debug_task.  The expansion is wrapped in do { } while (0) so the macro
 * behaves as a single statement (no dangling-else capture when used inside
 * an unbraced if/else), and the mask argument is parenthesized so that
 * compound expressions bind as the caller intended.
 */
#define TLOG(a, fmt, args...) do { if (debug_task & (a)) { kprintf(fmt, ## args); } } while (0)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif
386
387 static processor_t
388 thread_bind_internal(
389 thread_t thread,
390 processor_t processor);
391
392 static void
393 sched_vm_group_maintenance(void);
394
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/* Per-load shift table, filled in by load_shift_init() */
int8_t          sched_load_shifts[NRQS];
/* Bitmap of priorities marked by preempt_pri_init() */
bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
399
400 /*
401 * Statically allocate a buffer to hold the longest possible
402 * scheduler description string, as currently implemented.
403 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
404 * to export to userspace via sysctl(3). If either version
405 * changes, update the other.
406 *
407 * Note that in addition to being an upper bound on the strings
408 * in the kernel, it's also an exact parameter to PE_get_default(),
409 * which interrogates the device tree on some platforms. That
410 * API requires the caller know the exact size of the device tree
411 * property, so we need both a legacy size (32) and the current size
412 * (48) to deal with old and new device trees. The device tree property
413 * is similarly padded to a fixed size so that the same kernel image
414 * can run on multiple devices with different schedulers configured
415 * in the device tree.
416 */
417 char sched_string[SCHED_STRING_MAX_LENGTH];
418
/* Scheduler debug flags; sched_init() lets the "sched_debug" boot-arg override them */
uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
/* Read from the "enable_skstsct" boot-arg in sched_init() */
int enable_task_set_cluster_type = 0;
/* Set by sched_init() when that boot-arg is present and equals 2 */
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */
428
429 void
sched_init(void)430 sched_init(void)
431 {
432 boolean_t direct_handoff = FALSE;
433 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
434
435 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
436 /* No boot-args, check in device tree */
437 if (!PE_get_default("kern.sched_pri_decay_limit",
438 &sched_pri_decay_band_limit,
439 sizeof(sched_pri_decay_band_limit))) {
440 /* Allow decay all the way to normal limits */
441 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
442 }
443 }
444
445 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
446
447 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
448 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
449 }
450 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
451
452 cpu_quiescent_counter_init();
453
454 SCHED(init)();
455 SCHED(rt_init)(&pset0);
456 sched_timer_deadline_tracking_init();
457
458 SCHED(pset_init)(&pset0);
459 SCHED(processor_init)(master_processor);
460
461 if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
462 allow_direct_handoff = direct_handoff;
463 }
464
465 #if DEVELOPMENT || DEBUG
466 if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
467 system_ecore_only = (enable_task_set_cluster_type == 2);
468 }
469 #endif /* DEVELOPMENT || DEBUG */
470 }
471
472 void
sched_timebase_init(void)473 sched_timebase_init(void)
474 {
475 uint64_t abstime;
476
477 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
478 sched_one_second_interval = abstime;
479
480 SCHED(timebase_init)();
481 sched_realtime_timebase_init();
482 }
483
484 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
485
486 void
sched_timeshare_init(void)487 sched_timeshare_init(void)
488 {
489 /*
490 * Calculate the timeslicing quantum
491 * in us.
492 */
493 if (default_preemption_rate < 1) {
494 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
495 }
496 std_quantum_us = (1000 * 1000) / default_preemption_rate;
497
498 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
499
500 if (default_bg_preemption_rate < 1) {
501 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
502 }
503 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
504
505 printf("standard background quantum is %d us\n", bg_quantum_us);
506
507 load_shift_init();
508 preempt_pri_init();
509 sched_tick = 0;
510 }
511
512 void
sched_timeshare_timebase_init(void)513 sched_timeshare_timebase_init(void)
514 {
515 uint64_t abstime;
516 uint32_t shift;
517
518 /* standard timeslicing quantum */
519 clock_interval_to_absolutetime_interval(
520 std_quantum_us, NSEC_PER_USEC, &abstime);
521 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
522 std_quantum = (uint32_t)abstime;
523
524 /* smallest remaining quantum (250 us) */
525 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
526 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
527 min_std_quantum = (uint32_t)abstime;
528
529 /* quantum for background tasks */
530 clock_interval_to_absolutetime_interval(
531 bg_quantum_us, NSEC_PER_USEC, &abstime);
532 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
533 bg_quantum = (uint32_t)abstime;
534
535 /* scheduler tick interval */
536 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
537 NSEC_PER_USEC, &abstime);
538 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
539 sched_tick_interval = (uint32_t)abstime;
540
541 /* timeshare load calculation interval & deadline initialization */
542 clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
543 os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);
544
545 /*
546 * Compute conversion factor from usage to
547 * timesharing priorities with 5/8 ** n aging.
548 */
549 abstime = (abstime * 5) / 3;
550 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
551 abstime >>= 1;
552 }
553 sched_fixed_shift = shift;
554
555 for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
556 sched_pri_shifts[i] = INT8_MAX;
557 }
558
559 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
560 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
561
562 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
563 thread_depress_time = 1 * std_quantum;
564 default_timeshare_computation = std_quantum / 2;
565 default_timeshare_constraint = std_quantum;
566
567 #if __arm__ || __arm64__
568 perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
569 #endif /* __arm__ || __arm64__ */
570 }
571
572 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
573
574 void
pset_rt_init(processor_set_t pset)575 pset_rt_init(processor_set_t pset)
576 {
577 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
578 int i = pri - BASEPRI_RTQUEUES;
579 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
580 queue_init(&rqi->pri_queue);
581 rqi->pri_count = 0;
582 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
583 rqi->pri_constraint = RT_CONSTRAINT_NONE;
584 }
585 os_atomic_init(&pset->rt_runq.count, 0);
586 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
587 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
588 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
589 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
590 }
591
/* constraint limit for low latency RT threads, in microseconds */
int rt_constraint_ll_us = 0;

/*
 *	sched_get_rt_constraint_ll:
 *
 *	Returns the current low-latency constraint limit in microseconds.
 */
int
sched_get_rt_constraint_ll(void)
{
	int current_us = rt_constraint_ll_us;

	return current_us;
}
600
601 void
sched_set_rt_constraint_ll(int new_constraint_us)602 sched_set_rt_constraint_ll(int new_constraint_us)
603 {
604 rt_constraint_ll_us = new_constraint_us;
605
606 uint64_t abstime;
607 clock_interval_to_absolutetime_interval(rt_constraint_ll_us, NSEC_PER_USEC, &abstime);
608 assert((abstime >> 32) == 0 && ((rt_constraint_ll_us == 0) || (uint32_t)abstime != 0));
609 rt_constraint_ll = (uint32_t)abstime;
610 }
611
/* epsilon for comparing RT deadlines, in microseconds */
int rt_deadline_epsilon_us = 100;

/*
 *	sched_get_rt_deadline_epsilon:
 *
 *	Returns the current RT deadline comparison epsilon in microseconds.
 */
int
sched_get_rt_deadline_epsilon(void)
{
	int current_us = rt_deadline_epsilon_us;

	return current_us;
}
620
621 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)622 sched_set_rt_deadline_epsilon(int new_epsilon_us)
623 {
624 rt_deadline_epsilon_us = new_epsilon_us;
625
626 uint64_t abstime;
627 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
628 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
629 rt_deadline_epsilon = (uint32_t)abstime;
630 }
631
632 static void
sched_realtime_timebase_init(void)633 sched_realtime_timebase_init(void)
634 {
635 uint64_t abstime;
636
637 /* smallest rt computation (50 us) */
638 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
639 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
640 min_rt_quantum = (uint32_t)abstime;
641
642 /* maximum rt computation (50 ms) */
643 clock_interval_to_absolutetime_interval(
644 50, 1000 * NSEC_PER_USEC, &abstime);
645 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
646 max_rt_quantum = (uint32_t)abstime;
647
648 /* constraint threshold for sending backup IPIs (4 ms) */
649 clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
650 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
651 rt_constraint_threshold = (uint32_t)abstime;
652
653 /* constraint limit for low latency RT threads */
654 sched_set_rt_constraint_ll(rt_constraint_ll_us);
655
656 /* epsilon for comparing deadlines */
657 sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
658 }
659
660 void
sched_check_spill(processor_set_t pset,thread_t thread)661 sched_check_spill(processor_set_t pset, thread_t thread)
662 {
663 (void)pset;
664 (void)thread;
665
666 return;
667 }
668
669 bool
sched_thread_should_yield(processor_t processor,thread_t thread)670 sched_thread_should_yield(processor_t processor, thread_t thread)
671 {
672 (void)thread;
673
674 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
675 }
676
677 /* Default implementations of .steal_thread_enabled */
678 bool
sched_steal_thread_DISABLED(processor_set_t pset)679 sched_steal_thread_DISABLED(processor_set_t pset)
680 {
681 (void)pset;
682 return false;
683 }
684
685 bool
sched_steal_thread_enabled(processor_set_t pset)686 sched_steal_thread_enabled(processor_set_t pset)
687 {
688 return bit_count(pset->node->pset_map) > 1;
689 }
690
691 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
692
693 /*
694 * Set up values for timeshare
695 * loading factors.
696 */
697 static void
load_shift_init(void)698 load_shift_init(void)
699 {
700 int8_t k, *p = sched_load_shifts;
701 uint32_t i, j;
702
703 uint32_t sched_decay_penalty = 1;
704
705 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
706 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
707 }
708
709 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
710 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
711 }
712
713 if (sched_decay_penalty == 0) {
714 /*
715 * There is no penalty for timeshare threads for using too much
716 * CPU, so set all load shifts to INT8_MIN. Even under high load,
717 * sched_pri_shift will be >INT8_MAX, and there will be no
718 * penalty applied to threads (nor will sched_usage be updated per
719 * thread).
720 */
721 for (i = 0; i < NRQS; i++) {
722 sched_load_shifts[i] = INT8_MIN;
723 }
724
725 return;
726 }
727
728 *p++ = INT8_MIN; *p++ = 0;
729
730 /*
731 * For a given system load "i", the per-thread priority
732 * penalty per quantum of CPU usage is ~2^k priority
733 * levels. "sched_decay_penalty" can cause more
734 * array entries to be filled with smaller "k" values
735 */
736 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
737 for (j <<= 1; (i < j) && (i < NRQS); ++i) {
738 *p++ = k;
739 }
740 }
741 }
742
743 static void
preempt_pri_init(void)744 preempt_pri_init(void)
745 {
746 bitmap_t *p = sched_preempt_pri;
747
748 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
749 bitmap_set(p, i);
750 }
751
752 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
753 bitmap_set(p, i);
754 }
755 }
756
757 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
758
759 void
check_monotonic_time(uint64_t ctime)760 check_monotonic_time(uint64_t ctime)
761 {
762 processor_t processor = current_processor();
763 uint64_t last_dispatch = processor->last_dispatch;
764
765 if (last_dispatch > ctime) {
766 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
767 last_dispatch, ctime);
768 }
769 }
770
771
772 /*
773 * Thread wait timer expiration.
774 */
775 void
thread_timer_expire(void * p0,__unused void * p1)776 thread_timer_expire(
777 void *p0,
778 __unused void *p1)
779 {
780 thread_t thread = p0;
781 spl_t s;
782
783 assert_thread_magic(thread);
784
785 s = splsched();
786 thread_lock(thread);
787 if (--thread->wait_timer_active == 0) {
788 if (thread->wait_timer_is_set) {
789 thread->wait_timer_is_set = FALSE;
790 clear_wait_internal(thread, THREAD_TIMED_OUT);
791 }
792 }
793 thread_unlock(thread);
794 splx(s);
795 }
796
/*
 *	thread_unblock:
 *
 *	Unblock thread on wake up.
 *
 *	Records wresult as the thread's wait result, cancels a pending
 *	wait timer, clears the wait bits and sets TH_RUN in the thread
 *	state, and performs the wakeup bookkeeping: runnable timestamps
 *	and run count (only if the thread was not already TH_RUN), the
 *	real-time deadline, quantum reset, interrupt / platform-idle
 *	wakeup accounting, and tracing.
 *
 *	Returns TRUE if the thread should now be placed on the runqueue.
 *
 *	Thread must be locked.
 *
 *	Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t                thread,
	wait_result_t   wresult)
{
	boolean_t               ready_for_runq = FALSE;
	thread_t                cthread = current_thread();
	/* Assigned on both branches below; not read again within this function */
	uint32_t                new_run_count;
	int                             old_thread_state;

	/*
	 *	Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 *	Cancel pending wait timer.
	 */
	if (thread->wait_timer_is_set) {
		/* Only drop the active count when the cancel reports success */
		if (timer_call_cancel(thread->wait_timer)) {
			thread->wait_timer_active--;
		}
		thread->wait_timer_is_set = FALSE;
	}

	/* Sampled once here; used by the wakeup accounting further down */
	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 *	Update scheduling state: not waiting,
	 *	set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT);

	if ((old_thread_state & TH_RUN) == 0) {
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		/* Notify the sched_call only if the wait had asked for a report */
		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		/* Bucket the wakeup by the processor's timer_call_ttd, when non-zero */
		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		/* The waker is a callout thread: charge the wakeup it was itself woken by */
		if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	/* A woken callout thread remembers the context it was woken from */
	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
961
962 /*
963 * Routine: thread_allowed_for_handoff
964 * Purpose:
965 * Check if the thread is allowed for handoff operation
966 * Conditions:
967 * thread lock held, IPC locks may be held.
968 * TODO: In future, do not allow handoff if threads have different cluster
969 * recommendations.
970 */
971 boolean_t
thread_allowed_for_handoff(thread_t thread)972 thread_allowed_for_handoff(
973 thread_t thread)
974 {
975 thread_t self = current_thread();
976
977 if (allow_direct_handoff &&
978 thread->sched_mode == TH_MODE_REALTIME &&
979 self->sched_mode == TH_MODE_REALTIME) {
980 return TRUE;
981 }
982
983 return FALSE;
984 }
985
986 /*
987 * Routine: thread_go
988 * Purpose:
989 * Unblock and dispatch thread.
990 * Conditions:
991 * thread lock held, IPC locks may be held.
992 * thread must have been pulled from wait queue under same lock hold.
993 * thread must have been waiting
994 * Returns:
995 * KERN_SUCCESS - Thread was set running
996 *
997 * TODO: This should return void
998 */
999 kern_return_t
thread_go(thread_t thread,wait_result_t wresult,waitq_options_t option)1000 thread_go(
1001 thread_t thread,
1002 wait_result_t wresult,
1003 waitq_options_t option)
1004 {
1005 thread_t self = current_thread();
1006
1007 assert_thread_magic(thread);
1008
1009 assert(thread->at_safe_point == FALSE);
1010 assert(thread->wait_event == NO_EVENT64);
1011 assert(thread->waitq == NULL);
1012
1013 assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
1014 assert(thread->state & TH_WAIT);
1015
1016
1017 if (thread_unblock(thread, wresult)) {
1018 #if SCHED_TRACE_THREAD_WAKEUPS
1019 backtrace(&thread->thread_wakeup_bt[0],
1020 (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
1021 NULL);
1022 #endif /* SCHED_TRACE_THREAD_WAKEUPS */
1023 if ((option & WQ_OPTION_HANDOFF) &&
1024 thread_allowed_for_handoff(thread)) {
1025 thread_reference(thread);
1026 assert(self->handoff_thread == NULL);
1027 self->handoff_thread = thread;
1028 } else {
1029 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1030 }
1031 }
1032
1033 return KERN_SUCCESS;
1034 }
1035
1036 /*
1037 * Routine: thread_mark_wait_locked
1038 * Purpose:
1039 * Mark a thread as waiting. If, given the circumstances,
1040 * it doesn't want to wait (i.e. already aborted), then
1041 * indicate that in the return value.
1042 * Conditions:
1043 * at splsched() and thread is locked.
1044 */
1045 __private_extern__
1046 wait_result_t
thread_mark_wait_locked(thread_t thread,wait_interrupt_t interruptible_orig)1047 thread_mark_wait_locked(
1048 thread_t thread,
1049 wait_interrupt_t interruptible_orig)
1050 {
1051 boolean_t at_safe_point;
1052 wait_interrupt_t interruptible = interruptible_orig;
1053
1054 if (thread->state & TH_IDLE) {
1055 panic("Invalid attempt to wait while running the idle thread");
1056 }
1057
1058 assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
1059
1060 /*
1061 * The thread may have certain types of interrupts/aborts masked
1062 * off. Even if the wait location says these types of interrupts
1063 * are OK, we have to honor mask settings (outer-scoped code may
1064 * not be able to handle aborts at the moment).
1065 */
1066 interruptible &= TH_OPT_INTMASK;
1067 if (interruptible > (thread->options & TH_OPT_INTMASK)) {
1068 interruptible = thread->options & TH_OPT_INTMASK;
1069 }
1070
1071 at_safe_point = (interruptible == THREAD_ABORTSAFE);
1072
1073 if (interruptible == THREAD_UNINT ||
1074 !(thread->sched_flags & TH_SFLAG_ABORT) ||
1075 (!at_safe_point &&
1076 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1077 if (!(thread->state & TH_TERMINATE)) {
1078 DTRACE_SCHED(sleep);
1079 }
1080
1081 int state_bits = TH_WAIT;
1082 if (!interruptible) {
1083 state_bits |= TH_UNINT;
1084 }
1085 if (thread->sched_call) {
1086 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
1087 if (is_kerneltask(get_threadtask(thread))) {
1088 mask = THREAD_WAIT_NOREPORT_KERNEL;
1089 }
1090 if ((interruptible_orig & mask) == 0) {
1091 state_bits |= TH_WAIT_REPORT;
1092 }
1093 }
1094 thread->state |= state_bits;
1095 thread->at_safe_point = at_safe_point;
1096
1097 /* TODO: pass this through assert_wait instead, have
1098 * assert_wait just take a struct as an argument */
1099 assert(!thread->block_hint);
1100 thread->block_hint = thread->pending_block_hint;
1101 thread->pending_block_hint = kThreadWaitNone;
1102
1103 return thread->wait_result = THREAD_WAITING;
1104 } else {
1105 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
1106 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1107 }
1108 }
1109 thread->pending_block_hint = kThreadWaitNone;
1110
1111 return thread->wait_result = THREAD_INTERRUPTED;
1112 }
1113
1114 /*
1115 * Routine: thread_interrupt_level
1116 * Purpose:
1117 * Set the maximum interruptible state for the
1118 * current thread. The effective value of any
1119 * interruptible flag passed into assert_wait
1120 * will never exceed this.
1121 *
1122 * Useful for code that must not be interrupted,
1123 * but which calls code that doesn't know that.
1124 * Returns:
1125 * The old interrupt level for the thread.
1126 */
1127 __private_extern__
1128 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1129 thread_interrupt_level(
1130 wait_interrupt_t new_level)
1131 {
1132 thread_t thread = current_thread();
1133 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1134
1135 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1136
1137 return result;
1138 }
1139
1140 /*
1141 * assert_wait:
1142 *
1143 * Assert that the current thread is about to go to
1144 * sleep until the specified event occurs.
1145 */
1146 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1147 assert_wait(
1148 event_t event,
1149 wait_interrupt_t interruptible)
1150 {
1151 if (__improbable(event == NO_EVENT)) {
1152 panic("%s() called with NO_EVENT", __func__);
1153 }
1154
1155 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1156 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1157 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1158
1159 struct waitq *waitq;
1160 waitq = global_eventq(event);
1161 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1162 }
1163
1164 /*
1165 * assert_wait_queue:
1166 *
1167 * Return the global waitq for the specified event
1168 */
1169 struct waitq *
assert_wait_queue(event_t event)1170 assert_wait_queue(
1171 event_t event)
1172 {
1173 return global_eventq(event);
1174 }
1175
1176 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1177 assert_wait_timeout(
1178 event_t event,
1179 wait_interrupt_t interruptible,
1180 uint32_t interval,
1181 uint32_t scale_factor)
1182 {
1183 thread_t thread = current_thread();
1184 wait_result_t wresult;
1185 uint64_t deadline;
1186 spl_t s;
1187
1188 if (__improbable(event == NO_EVENT)) {
1189 panic("%s() called with NO_EVENT", __func__);
1190 }
1191
1192 struct waitq *waitq;
1193 waitq = global_eventq(event);
1194
1195 s = splsched();
1196 waitq_lock(waitq);
1197
1198 clock_interval_to_deadline(interval, scale_factor, &deadline);
1199
1200 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1201 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1202 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1203
1204 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1205 interruptible,
1206 TIMEOUT_URGENCY_SYS_NORMAL,
1207 deadline, TIMEOUT_NO_LEEWAY,
1208 thread);
1209
1210 waitq_unlock(waitq);
1211 splx(s);
1212 return wresult;
1213 }
1214
1215 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1216 assert_wait_timeout_with_leeway(
1217 event_t event,
1218 wait_interrupt_t interruptible,
1219 wait_timeout_urgency_t urgency,
1220 uint32_t interval,
1221 uint32_t leeway,
1222 uint32_t scale_factor)
1223 {
1224 thread_t thread = current_thread();
1225 wait_result_t wresult;
1226 uint64_t deadline;
1227 uint64_t abstime;
1228 uint64_t slop;
1229 uint64_t now;
1230 spl_t s;
1231
1232 if (__improbable(event == NO_EVENT)) {
1233 panic("%s() called with NO_EVENT", __func__);
1234 }
1235
1236 now = mach_absolute_time();
1237 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1238 deadline = now + abstime;
1239
1240 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1241
1242 struct waitq *waitq;
1243 waitq = global_eventq(event);
1244
1245 s = splsched();
1246 waitq_lock(waitq);
1247
1248 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1249 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1250 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1251
1252 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1253 interruptible,
1254 urgency, deadline, slop,
1255 thread);
1256
1257 waitq_unlock(waitq);
1258 splx(s);
1259 return wresult;
1260 }
1261
1262 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1263 assert_wait_deadline(
1264 event_t event,
1265 wait_interrupt_t interruptible,
1266 uint64_t deadline)
1267 {
1268 thread_t thread = current_thread();
1269 wait_result_t wresult;
1270 spl_t s;
1271
1272 if (__improbable(event == NO_EVENT)) {
1273 panic("%s() called with NO_EVENT", __func__);
1274 }
1275
1276 struct waitq *waitq;
1277 waitq = global_eventq(event);
1278
1279 s = splsched();
1280 waitq_lock(waitq);
1281
1282 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1283 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1284 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1285
1286 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1287 interruptible,
1288 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1289 TIMEOUT_NO_LEEWAY, thread);
1290 waitq_unlock(waitq);
1291 splx(s);
1292 return wresult;
1293 }
1294
1295 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1296 assert_wait_deadline_with_leeway(
1297 event_t event,
1298 wait_interrupt_t interruptible,
1299 wait_timeout_urgency_t urgency,
1300 uint64_t deadline,
1301 uint64_t leeway)
1302 {
1303 thread_t thread = current_thread();
1304 wait_result_t wresult;
1305 spl_t s;
1306
1307 if (__improbable(event == NO_EVENT)) {
1308 panic("%s() called with NO_EVENT", __func__);
1309 }
1310
1311 struct waitq *waitq;
1312 waitq = global_eventq(event);
1313
1314 s = splsched();
1315 waitq_lock(waitq);
1316
1317 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1318 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1319 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1320
1321 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1322 interruptible,
1323 urgency, deadline, leeway,
1324 thread);
1325 waitq_unlock(waitq);
1326 splx(s);
1327 return wresult;
1328 }
1329
1330 /*
1331 * thread_isoncpu:
1332 *
1333 * Return TRUE if a thread is running on a processor such that an AST
1334 * is needed to pull it out of userspace execution, or if executing in
1335 * the kernel, bring to a context switch boundary that would cause
1336 * thread state to be serialized in the thread PCB.
1337 *
1338 * Thread locked, returns the same way. While locked, fields
1339 * like "state" cannot change. "runq" can change only from set to unset.
1340 */
1341 static inline boolean_t
thread_isoncpu(thread_t thread)1342 thread_isoncpu(thread_t thread)
1343 {
1344 /* Not running or runnable */
1345 if (!(thread->state & TH_RUN)) {
1346 return FALSE;
1347 }
1348
1349 /* Waiting on a runqueue, not currently running */
1350 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1351 if (thread->runq != PROCESSOR_NULL) {
1352 return FALSE;
1353 }
1354
1355 /*
1356 * Thread does not have a stack yet
1357 * It could be on the stack alloc queue or preparing to be invoked
1358 */
1359 if (!thread->kernel_stack) {
1360 return FALSE;
1361 }
1362
1363 /*
1364 * Thread must be running on a processor, or
1365 * about to run, or just did run. In all these
1366 * cases, an AST to the processor is needed
1367 * to guarantee that the thread is kicked out
1368 * of userspace and the processor has
1369 * context switched (and saved register state).
1370 */
1371 return TRUE;
1372 }
1373
1374 /*
1375 * thread_stop:
1376 *
1377 * Force a preemption point for a thread and wait
1378 * for it to stop running on a CPU. If a stronger
1379 * guarantee is requested, wait until no longer
1380 * runnable. Arbitrates access among
1381 * multiple stop requests. (released by unstop)
1382 *
1383 * The thread must enter a wait state and stop via a
1384 * separate means.
1385 *
1386 * Returns FALSE if interrupted.
1387 */
1388 boolean_t
thread_stop(thread_t thread,boolean_t until_not_runnable)1389 thread_stop(
1390 thread_t thread,
1391 boolean_t until_not_runnable)
1392 {
1393 wait_result_t wresult;
1394 spl_t s = splsched();
1395 boolean_t oncpu;
1396
1397 wake_lock(thread);
1398 thread_lock(thread);
1399
1400 while (thread->state & TH_SUSP) {
1401 thread->wake_active = TRUE;
1402 thread_unlock(thread);
1403
1404 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1405 wake_unlock(thread);
1406 splx(s);
1407
1408 if (wresult == THREAD_WAITING) {
1409 wresult = thread_block(THREAD_CONTINUE_NULL);
1410 }
1411
1412 if (wresult != THREAD_AWAKENED) {
1413 return FALSE;
1414 }
1415
1416 s = splsched();
1417 wake_lock(thread);
1418 thread_lock(thread);
1419 }
1420
1421 thread->state |= TH_SUSP;
1422
1423 while ((oncpu = thread_isoncpu(thread)) ||
1424 (until_not_runnable && (thread->state & TH_RUN))) {
1425 processor_t processor;
1426
1427 if (oncpu) {
1428 assert(thread->state & TH_RUN);
1429 processor = thread->chosen_processor;
1430 cause_ast_check(processor);
1431 }
1432
1433 thread->wake_active = TRUE;
1434 thread_unlock(thread);
1435
1436 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1437 wake_unlock(thread);
1438 splx(s);
1439
1440 if (wresult == THREAD_WAITING) {
1441 wresult = thread_block(THREAD_CONTINUE_NULL);
1442 }
1443
1444 if (wresult != THREAD_AWAKENED) {
1445 thread_unstop(thread);
1446 return FALSE;
1447 }
1448
1449 s = splsched();
1450 wake_lock(thread);
1451 thread_lock(thread);
1452 }
1453
1454 thread_unlock(thread);
1455 wake_unlock(thread);
1456 splx(s);
1457
1458 /*
1459 * We return with the thread unlocked. To prevent it from
1460 * transitioning to a runnable state (or from TH_RUN to
1461 * being on the CPU), the caller must ensure the thread
1462 * is stopped via an external means (such as an AST)
1463 */
1464
1465 return TRUE;
1466 }
1467
1468 /*
1469 * thread_unstop:
1470 *
1471 * Release a previous stop request and set
1472 * the thread running if appropriate.
1473 *
1474 * Use only after a successful stop operation.
1475 */
1476 void
thread_unstop(thread_t thread)1477 thread_unstop(
1478 thread_t thread)
1479 {
1480 spl_t s = splsched();
1481
1482 wake_lock(thread);
1483 thread_lock(thread);
1484
1485 assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);
1486
1487 if (thread->state & TH_SUSP) {
1488 thread->state &= ~TH_SUSP;
1489
1490 if (thread->wake_active) {
1491 thread->wake_active = FALSE;
1492 thread_unlock(thread);
1493
1494 thread_wakeup(&thread->wake_active);
1495 wake_unlock(thread);
1496 splx(s);
1497
1498 return;
1499 }
1500 }
1501
1502 thread_unlock(thread);
1503 wake_unlock(thread);
1504 splx(s);
1505 }
1506
1507 /*
1508 * thread_wait:
1509 *
1510 * Wait for a thread to stop running. (non-interruptible)
1511 *
1512 */
1513 void
thread_wait(thread_t thread,boolean_t until_not_runnable)1514 thread_wait(
1515 thread_t thread,
1516 boolean_t until_not_runnable)
1517 {
1518 wait_result_t wresult;
1519 boolean_t oncpu;
1520 processor_t processor;
1521 spl_t s = splsched();
1522
1523 wake_lock(thread);
1524 thread_lock(thread);
1525
1526 /*
1527 * Wait until not running on a CPU. If stronger requirement
1528 * desired, wait until not runnable. Assumption: if thread is
1529 * on CPU, then TH_RUN is set, so we're not waiting in any case
1530 * where the original, pure "TH_RUN" check would have let us
1531 * finish.
1532 */
1533 while ((oncpu = thread_isoncpu(thread)) ||
1534 (until_not_runnable && (thread->state & TH_RUN))) {
1535 if (oncpu) {
1536 assert(thread->state & TH_RUN);
1537 processor = thread->chosen_processor;
1538 cause_ast_check(processor);
1539 }
1540
1541 thread->wake_active = TRUE;
1542 thread_unlock(thread);
1543
1544 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1545 wake_unlock(thread);
1546 splx(s);
1547
1548 if (wresult == THREAD_WAITING) {
1549 thread_block(THREAD_CONTINUE_NULL);
1550 }
1551
1552 s = splsched();
1553 wake_lock(thread);
1554 thread_lock(thread);
1555 }
1556
1557 thread_unlock(thread);
1558 wake_unlock(thread);
1559 splx(s);
1560 }
1561
1562 /*
1563 * Routine: clear_wait_internal
1564 *
1565 * Clear the wait condition for the specified thread.
1566 * Start the thread executing if that is appropriate.
1567 * Arguments:
1568 * thread thread to awaken
1569 * result Wakeup result the thread should see
1570 * Conditions:
1571 * At splsched
1572 * the thread is locked.
1573 * Returns:
1574 * KERN_SUCCESS thread was rousted out a wait
1575 * KERN_FAILURE thread was waiting but could not be rousted
1576 * KERN_NOT_WAITING thread was not waiting
1577 */
1578 __private_extern__ kern_return_t
clear_wait_internal(thread_t thread,wait_result_t wresult)1579 clear_wait_internal(
1580 thread_t thread,
1581 wait_result_t wresult)
1582 {
1583 struct waitq *waitq = thread->waitq;
1584 #if !CONFIG_WAITQ_IRQSAFE_ALLOW_INVALID
1585 uint32_t timeout = os_atomic_load(&LockTimeOutUsec, relaxed);
1586 uint32_t i = timeout;
1587
1588 again:
1589 #endif /* !CONFIG_WAITQ_IRQSAFE_ALLOW_INVALID */
1590
1591 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1592 return KERN_FAILURE;
1593 }
1594
1595 if (waitq != NULL && !waitq_pull_thread_locked(waitq, thread)) {
1596 #if !CONFIG_WAITQ_IRQSAFE_ALLOW_INVALID
1597 thread_unlock(thread);
1598 delay(1);
1599 if (timeout > 0 && i > 0 && !machine_timeout_suspended()) {
1600 i--;
1601 }
1602 thread_lock(thread);
1603
1604 if (waitq == thread->waitq) {
1605 if (timeout != 0 && i == 0) {
1606 panic("%s: deadlock: thread=%p, wq=%p, cpu=%d",
1607 __func__, thread, waitq, cpu_number());
1608 }
1609 goto again;
1610 }
1611 #endif /* !CONFIG_WAITQ_IRQSAFE_ALLOW_INVALID */
1612 return KERN_NOT_WAITING;
1613 }
1614
1615 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1616 if ((thread->state & (TH_WAIT | TH_TERMINATE)) != TH_WAIT) {
1617 return KERN_NOT_WAITING;
1618 }
1619
1620 return thread_go(thread, wresult, WQ_OPTION_NONE);
1621 }
1622
1623
1624 /*
1625 * clear_wait:
1626 *
1627 * Clear the wait condition for the specified thread. Start the thread
1628 * executing if that is appropriate.
1629 *
1630 * parameters:
1631 * thread thread to awaken
1632 * result Wakeup result the thread should see
1633 */
1634 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1635 clear_wait(
1636 thread_t thread,
1637 wait_result_t result)
1638 {
1639 kern_return_t ret;
1640 spl_t s;
1641
1642 s = splsched();
1643 thread_lock(thread);
1644 ret = clear_wait_internal(thread, result);
1645 thread_unlock(thread);
1646 splx(s);
1647 return ret;
1648 }
1649
1650
1651 /*
1652 * thread_wakeup_prim:
1653 *
1654 * Common routine for thread_wakeup, thread_wakeup_with_result,
1655 * and thread_wakeup_one.
1656 *
1657 */
1658 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1659 thread_wakeup_prim(
1660 event_t event,
1661 boolean_t one_thread,
1662 wait_result_t result)
1663 {
1664 if (__improbable(event == NO_EVENT)) {
1665 panic("%s() called with NO_EVENT", __func__);
1666 }
1667
1668 struct waitq *wq = global_eventq(event);
1669
1670 if (one_thread) {
1671 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1672 } else {
1673 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1674 }
1675 }
1676
1677 /*
1678 * Wakeup a specified thread if and only if it's waiting for this event
1679 */
1680 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1681 thread_wakeup_thread(
1682 event_t event,
1683 thread_t thread)
1684 {
1685 if (__improbable(event == NO_EVENT)) {
1686 panic("%s() called with NO_EVENT", __func__);
1687 }
1688
1689 if (__improbable(thread == THREAD_NULL)) {
1690 panic("%s() called with THREAD_NULL", __func__);
1691 }
1692
1693 struct waitq *wq = global_eventq(event);
1694
1695 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1696 }
1697
1698 /*
1699 * Wakeup a thread waiting on an event and promote it to a priority.
1700 *
1701 * Requires woken thread to un-promote itself when done.
1702 */
1703 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1704 thread_wakeup_one_with_pri(
1705 event_t event,
1706 int priority)
1707 {
1708 if (__improbable(event == NO_EVENT)) {
1709 panic("%s() called with NO_EVENT", __func__);
1710 }
1711
1712 struct waitq *wq = global_eventq(event);
1713
1714 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1715 }
1716
1717 /*
1718 * Wakeup a thread waiting on an event,
1719 * promote it to a priority,
1720 * and return a reference to the woken thread.
1721 *
1722 * Requires woken thread to un-promote itself when done.
1723 */
1724 thread_t
thread_wakeup_identify(event_t event,int priority)1725 thread_wakeup_identify(event_t event,
1726 int priority)
1727 {
1728 if (__improbable(event == NO_EVENT)) {
1729 panic("%s() called with NO_EVENT", __func__);
1730 }
1731
1732 struct waitq *wq = global_eventq(event);
1733
1734 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1735 }
1736
1737 /*
1738 * thread_bind:
1739 *
1740 * Force the current thread to execute on the specified processor.
1741 * Takes effect after the next thread_block().
1742 *
1743 * Returns the previous binding. PROCESSOR_NULL means
1744 * not bound.
1745 *
1746 * XXX - DO NOT export this to users - XXX
1747 */
1748 processor_t
thread_bind(processor_t processor)1749 thread_bind(
1750 processor_t processor)
1751 {
1752 thread_t self = current_thread();
1753 processor_t prev;
1754 spl_t s;
1755
1756 s = splsched();
1757 thread_lock(self);
1758
1759 prev = thread_bind_internal(self, processor);
1760
1761 thread_unlock(self);
1762 splx(s);
1763
1764 return prev;
1765 }
1766
1767 /*
1768 * thread_bind_internal:
1769 *
1770 * If the specified thread is not the current thread, and it is currently
1771 * running on another CPU, a remote AST must be sent to that CPU to cause
1772 * the thread to migrate to its bound processor. Otherwise, the migration
1773 * will occur at the next quantum expiration or blocking point.
1774 *
1775 * When the thread is the current thread, and explicit thread_block() should
1776 * be used to force the current processor to context switch away and
1777 * let the thread migrate to the bound processor.
1778 *
1779 * Thread must be locked, and at splsched.
1780 */
1781
1782 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)1783 thread_bind_internal(
1784 thread_t thread,
1785 processor_t processor)
1786 {
1787 processor_t prev;
1788
1789 /* <rdar://problem/15102234> */
1790 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1791 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1792 assert(thread->runq == PROCESSOR_NULL);
1793
1794 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1795
1796 prev = thread->bound_processor;
1797 thread->bound_processor = processor;
1798
1799 return prev;
1800 }
1801
1802 /*
1803 * thread_vm_bind_group_add:
1804 *
1805 * The "VM bind group" is a special mechanism to mark a collection
1806 * of threads from the VM subsystem that, in general, should be scheduled
1807 * with only one CPU of parallelism. To accomplish this, we initially
1808 * bind all the threads to the master processor, which has the effect
1809 * that only one of the threads in the group can execute at once, including
1810 * preempting threads in the group that are a lower priority. Future
1811 * mechanisms may use more dynamic mechanisms to prevent the collection
1812 * of VM threads from using more CPU time than desired.
1813 *
1814 * The current implementation can result in priority inversions where
1815 * compute-bound priority 95 or realtime threads that happen to have
1816 * landed on the master processor prevent the VM threads from running.
1817 * When this situation is detected, we unbind the threads for one
1818 * scheduler tick to allow the scheduler to run the threads an
1819 * additional CPUs, before restoring the binding (assuming high latency
1820 * is no longer a problem).
1821 */
1822
1823 /*
1824 * The current max is provisioned for:
1825 * vm_compressor_swap_trigger_thread (92)
1826 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1827 * vm_pageout_continue (92)
1828 * memorystatus_thread (95)
1829 */
1830 #define MAX_VM_BIND_GROUP_COUNT (5)
1831 decl_simple_lock_data(static, sched_vm_group_list_lock);
1832 static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1833 static int sched_vm_group_thread_count;
1834 static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1835
1836 void
thread_vm_bind_group_add(void)1837 thread_vm_bind_group_add(void)
1838 {
1839 thread_t self = current_thread();
1840
1841 thread_reference(self);
1842 self->options |= TH_OPT_SCHED_VM_GROUP;
1843
1844 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1845 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1846 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1847 simple_unlock(&sched_vm_group_list_lock);
1848
1849 thread_bind(master_processor);
1850
1851 /* Switch to bound processor if not already there */
1852 thread_block(THREAD_CONTINUE_NULL);
1853 }
1854
1855 static void
sched_vm_group_maintenance(void)1856 sched_vm_group_maintenance(void)
1857 {
1858 uint64_t ctime = mach_absolute_time();
1859 uint64_t longtime = ctime - sched_tick_interval;
1860 int i;
1861 spl_t s;
1862 boolean_t high_latency_observed = FALSE;
1863 boolean_t runnable_and_not_on_runq_observed = FALSE;
1864 boolean_t bind_target_changed = FALSE;
1865 processor_t bind_target = PROCESSOR_NULL;
1866
1867 /* Make sure nobody attempts to add new threads while we are enumerating them */
1868 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1869
1870 s = splsched();
1871
1872 for (i = 0; i < sched_vm_group_thread_count; i++) {
1873 thread_t thread = sched_vm_group_thread_list[i];
1874 assert(thread != THREAD_NULL);
1875 thread_lock(thread);
1876 if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
1877 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1878 high_latency_observed = TRUE;
1879 } else if (thread->runq == PROCESSOR_NULL) {
1880 /* There are some cases where a thread be transitiong that also fall into this case */
1881 runnable_and_not_on_runq_observed = TRUE;
1882 }
1883 }
1884 thread_unlock(thread);
1885
1886 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1887 /* All the things we are looking for are true, stop looking */
1888 break;
1889 }
1890 }
1891
1892 splx(s);
1893
1894 if (sched_vm_group_temporarily_unbound) {
1895 /* If we turned off binding, make sure everything is OK before rebinding */
1896 if (!high_latency_observed) {
1897 /* rebind */
1898 bind_target_changed = TRUE;
1899 bind_target = master_processor;
1900 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1901 }
1902 } else {
1903 /*
1904 * Check if we're in a bad state, which is defined by high
1905 * latency with no core currently executing a thread. If a
1906 * single thread is making progress on a CPU, that means the
1907 * binding concept to reduce parallelism is working as
1908 * designed.
1909 */
1910 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1911 /* unbind */
1912 bind_target_changed = TRUE;
1913 bind_target = PROCESSOR_NULL;
1914 sched_vm_group_temporarily_unbound = TRUE;
1915 }
1916 }
1917
1918 if (bind_target_changed) {
1919 s = splsched();
1920 for (i = 0; i < sched_vm_group_thread_count; i++) {
1921 thread_t thread = sched_vm_group_thread_list[i];
1922 boolean_t removed;
1923 assert(thread != THREAD_NULL);
1924
1925 thread_lock(thread);
1926 removed = thread_run_queue_remove(thread);
1927 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1928 thread_bind_internal(thread, bind_target);
1929 } else {
1930 /*
1931 * Thread was in the middle of being context-switched-to,
1932 * or was in the process of blocking. To avoid switching the bind
1933 * state out mid-flight, defer the change if possible.
1934 */
1935 if (bind_target == PROCESSOR_NULL) {
1936 thread_bind_internal(thread, bind_target);
1937 } else {
1938 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1939 }
1940 }
1941
1942 if (removed) {
1943 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1944 }
1945 thread_unlock(thread);
1946 }
1947 splx(s);
1948 }
1949
1950 simple_unlock(&sched_vm_group_list_lock);
1951 }
1952
1953 #if defined(__x86_64__)
1954 #define SCHED_AVOID_CPU0 1
1955 #else
1956 #define SCHED_AVOID_CPU0 0
1957 #endif
1958
1959 int sched_allow_rt_smt = 1;
1960 int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
1961 int sched_choose_first_fd_processor = 1;
1962 int sched_allow_rt_steal = 1;
1963 int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
1964
1965 int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
1966
1967 int
sched_get_rt_n_backup_processors(void)1968 sched_get_rt_n_backup_processors(void)
1969 {
1970 return sched_rt_n_backup_processors;
1971 }
1972
1973 void
sched_set_rt_n_backup_processors(int n)1974 sched_set_rt_n_backup_processors(int n)
1975 {
1976 if (n < 0) {
1977 n = 0;
1978 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
1979 n = SCHED_MAX_BACKUP_PROCESSORS;
1980 }
1981
1982 sched_rt_n_backup_processors = n;
1983 }
1984
1985 int sched_rt_runq_strict_priority = false;
1986
1987 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)1988 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
1989 {
1990 if (current_pset != new_pset) {
1991 pset_unlock(current_pset);
1992 pset_lock(new_pset);
1993 }
1994
1995 return new_pset;
1996 }
1997
/*
 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
 * rebalancing opportunity exists when a core is (instantaneously) idle, but
 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
 * IPI thrash if this core does not remain idle following the load balancing ASTs
 * Idle "thrash", when IPI issue is followed by idle entry/core power down
 * followed by a wakeup shortly thereafter.
 */

#if (DEVELOPMENT || DEBUG)
/* Setting this to 0 makes sched_SMT_balance() skip rebalancing (DEVELOPMENT/DEBUG builds only) */
int sched_smt_balance = 1;
#endif
2010
2011 /* Invoked with pset locked, returns with pset unlocked */
2012 void
sched_SMT_balance(processor_t cprocessor,processor_set_t cpset)2013 sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
2014 {
2015 processor_t ast_processor = NULL;
2016
2017 #if (DEVELOPMENT || DEBUG)
2018 if (__improbable(sched_smt_balance == 0)) {
2019 goto smt_balance_exit;
2020 }
2021 #endif
2022
2023 assert(cprocessor == current_processor());
2024 if (cprocessor->is_SMT == FALSE) {
2025 goto smt_balance_exit;
2026 }
2027
2028 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
2029
2030 /* Determine if both this processor and its sibling are idle,
2031 * indicating an SMT rebalancing opportunity.
2032 */
2033 if (sib_processor->state != PROCESSOR_IDLE) {
2034 goto smt_balance_exit;
2035 }
2036
2037 processor_t sprocessor;
2038
2039 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2040 uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
2041 ~cpset->primary_map);
2042 for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
2043 sprocessor = processor_array[cpuid];
2044 if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
2045 (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
2046 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2047 if (ipi_type != SCHED_IPI_NONE) {
2048 assert(sprocessor != cprocessor);
2049 ast_processor = sprocessor;
2050 break;
2051 }
2052 }
2053 }
2054
2055 smt_balance_exit:
2056 pset_unlock(cpset);
2057
2058 if (ast_processor) {
2059 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
2060 sched_ipi_perform(ast_processor, ipi_type);
2061 }
2062 }
2063
2064 static cpumap_t
pset_available_cpumap(processor_set_t pset)2065 pset_available_cpumap(processor_set_t pset)
2066 {
2067 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]) &
2068 pset->recommended_bitmask;
2069 }
2070
2071 int
pset_available_cpu_count(processor_set_t pset)2072 pset_available_cpu_count(processor_set_t pset)
2073 {
2074 return bit_count(pset_available_cpumap(pset));
2075 }
2076
2077 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2078 pset_available_but_not_running_cpumap(processor_set_t pset)
2079 {
2080 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2081 pset->recommended_bitmask;
2082 }
2083
2084 bool
pset_has_stealable_threads(processor_set_t pset)2085 pset_has_stealable_threads(processor_set_t pset)
2086 {
2087 pset_assert_locked(pset);
2088
2089 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2090 /*
2091 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2092 * available primary CPUs
2093 */
2094 avail_map &= pset->primary_map;
2095
2096 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2097 }
2098
2099 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2100 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2101 {
2102 cpumap_t avail_map = pset_available_cpumap(pset);
2103 if (!sched_allow_rt_smt) {
2104 /*
2105 * Secondary CPUs are not allowed to run RT threads, so
2106 * only primary CPUs should be included
2107 */
2108 avail_map &= pset->primary_map;
2109 }
2110
2111 return avail_map & ~pset->realtime_map;
2112 }
2113
2114 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2115 pset_needs_a_followup_IPI(processor_set_t pset)
2116 {
2117 int nbackup_cpus = 0;
2118
2119 if (rt_runq_is_low_latency(pset)) {
2120 nbackup_cpus = sched_rt_n_backup_processors;
2121 }
2122
2123 int rt_rq_count = rt_runq_count(pset);
2124
2125 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2126 }
2127
2128 bool
pset_has_stealable_rt_threads(processor_set_t pset)2129 pset_has_stealable_rt_threads(processor_set_t pset)
2130 {
2131 pset_node_t node = pset->node;
2132 if (bit_count(node->pset_map) == 1) {
2133 return false;
2134 }
2135
2136 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2137
2138 return rt_runq_count(pset) > bit_count(avail_map);
2139 }
2140
2141 static void
pset_update_rt_stealable_state(processor_set_t pset)2142 pset_update_rt_stealable_state(processor_set_t pset)
2143 {
2144 if (pset_has_stealable_rt_threads(pset)) {
2145 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2146 } else {
2147 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2148 }
2149 }
2150
2151 static void
clear_pending_AST_bits(processor_set_t pset,processor_t processor,__kdebug_only const int trace_point_number)2152 clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2153 {
2154 /* Acknowledge any pending IPIs here with pset lock held */
2155 pset_assert_locked(pset);
2156 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2157 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2158 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2159 }
2160 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2161
2162 #if defined(CONFIG_SCHED_DEFERRED_AST)
2163 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2164 #endif
2165 }
2166
2167 /*
2168 * Called with pset locked, on a processor that is committing to run a new thread
2169 * Will transition an idle or dispatching processor to running as it picks up
2170 * the first new thread from the idle thread.
2171 */
2172 static void
pset_commit_processor_to_new_thread(processor_set_t pset,processor_t processor,thread_t new_thread)2173 pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2174 {
2175 pset_assert_locked(pset);
2176
2177 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2178 assert(current_thread() == processor->idle_thread);
2179
2180 /*
2181 * Dispatching processor is now committed to running new_thread,
2182 * so change its state to PROCESSOR_RUNNING.
2183 */
2184 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2185 } else {
2186 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2187 }
2188
2189 processor_state_update_from_thread(processor, new_thread, true);
2190
2191 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2192 bit_set(pset->realtime_map, processor->cpu_id);
2193 } else {
2194 bit_clear(pset->realtime_map, processor->cpu_id);
2195 }
2196 pset_update_rt_stealable_state(pset);
2197
2198 pset_node_t node = pset->node;
2199
2200 if (bit_count(node->pset_map) == 1) {
2201 /* Node has only a single pset, so skip node pset map updates */
2202 return;
2203 }
2204
2205 cpumap_t avail_map = pset_available_cpumap(pset);
2206
2207 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2208 if ((avail_map & pset->realtime_map) == avail_map) {
2209 /* No more non-RT CPUs in this pset */
2210 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2211 }
2212 avail_map &= pset->primary_map;
2213 if ((avail_map & pset->realtime_map) == avail_map) {
2214 /* No more non-RT primary CPUs in this pset */
2215 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2216 }
2217 } else {
2218 if ((avail_map & pset->realtime_map) != avail_map) {
2219 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2220 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2221 }
2222 }
2223 avail_map &= pset->primary_map;
2224 if ((avail_map & pset->realtime_map) != avail_map) {
2225 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2226 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2227 }
2228 }
2229 }
2230 }
2231
/*
 * Forward declarations of the realtime processor-selection helpers used by
 * the code below. Two of them are only declared on x86_64.
 */
static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills);
static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
#if defined(__x86_64__)
static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
#endif
static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2241
2242 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2243 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2244 {
2245 pset_map_t pset_map = stealing_pset->node->pset_map;
2246
2247 bit_clear(pset_map, stealing_pset->pset_id);
2248
2249 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2250 processor_set_t nset = pset_array[pset_id];
2251
2252 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2253 return true;
2254 }
2255 }
2256
2257 return false;
2258 }
2259
2260 /*
2261 * starting_pset must be locked, but returns true if it is unlocked before return
2262 */
2263 static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset,processor_t chosen_processor,bool spill_ipi,processor_t * result_processor,sched_ipi_type_t * result_ipi_type)2264 choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2265 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2266 {
2267 bool starting_pset_is_unlocked = false;
2268 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2269 int max_pri = rt_runq_priority(starting_pset);
2270 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2271 if (rt_constraint_ll != 0) {
2272 uint64_t ctime = mach_absolute_time();
2273 if (earliest_deadline < rt_constraint_ll + ctime) {
2274 earliest_deadline = rt_constraint_ll + ctime;
2275 }
2276 }
2277 processor_set_t pset = starting_pset;
2278 processor_t next_rt_processor = PROCESSOR_NULL;
2279 if (spill_ipi) {
2280 processor_set_t nset = next_pset(pset);
2281 assert(nset != starting_pset);
2282 pset = change_locked_pset(pset, nset);
2283 starting_pset_is_unlocked = true;
2284 }
2285 do {
2286 const bool consider_secondaries = true;
2287 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2288 if (next_rt_processor == PROCESSOR_NULL) {
2289 if (!spill_ipi) {
2290 break;
2291 }
2292 processor_set_t nset = next_pset(pset);
2293 if (nset == starting_pset) {
2294 break;
2295 }
2296 pset = change_locked_pset(pset, nset);
2297 starting_pset_is_unlocked = true;
2298 }
2299 } while (next_rt_processor == PROCESSOR_NULL);
2300 if (next_rt_processor) {
2301 if (pset != starting_pset) {
2302 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2303 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2304 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2305 }
2306 }
2307 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2308 *result_processor = next_rt_processor;
2309 }
2310 if (pset != starting_pset) {
2311 pset_unlock(pset);
2312 }
2313
2314 return starting_pset_is_unlocked;
2315 }
2316
/*
 * Kind of "next" processor chosen to receive an additional realtime IPI:
 *
 * none     - no next processor was chosen
 * backup   - used by choose_processor to send a backup IPI to in case the
 *            preferred processor can't immediately respond
 * followup - used in thread_select when there are still threads on the run
 *            queue and available processors
 * spill    - a processor in a different processor set that is signalled to
 *            steal a thread from this run queue
 */
typedef enum {
	none     = 0,
	backup   = 1,
	followup = 2,
	spill    = 3
} next_processor_type_t;
2328
/*
 * Debug instrumentation for thread_select(): when LOOP_COUNT is defined,
 * max_loop_count[] tracks, per CPU, the highest value reached by
 * thread_select()'s restart counter. The #undef below keeps the
 * instrumentation compiled out.
 */
#undef LOOP_COUNT
#ifdef LOOP_COUNT
int max_loop_count[MAX_SCHED_CPUS] = { 0 };
#endif
2333
/*
 *	thread_select:
 *
 *	Select a new thread for the current processor to execute.
 *
 *	May select the current thread, which must be locked.
 *
 *	thread		the thread currently running on the processor
 *	processor	must be current_processor() (asserted)
 *	reason		in/out AST flags: AST_YIELD is read, and AST_REBALANCE
 *			may be OR-ed in when the processor must context switch
 *
 *	Returns the thread to run next: the current thread, a thread taken
 *	from the realtime or regular run queue, a stolen thread, or the
 *	processor's idle thread.
 *
 *	The pset lock is taken here and is dropped again on every return
 *	path. The last argument of each MACH_SCHED_THREAD_SELECT end trace
 *	point identifies which return path was taken.
 */
static thread_t
thread_select(thread_t thread,
    processor_t processor,
    ast_t *reason)
{
	processor_set_t pset = processor->processor_set;
	thread_t new_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);

	/* Trace-only bookkeeping: why we went idle, and how often we delayed. */
	__kdebug_only int idle_reason = 0;
	__kdebug_only int delay_count = 0;

#if defined(__x86_64__)
	/* Number of 10us delays this processor may spend waiting as a backup. */
	int timeout_count = sched_backup_cpu_timeout_count;
	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
		/* Prefer cpu0 as backup */
		timeout_count--;
	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
		/* Prefer secondary cpu as backup */
		timeout_count--;
	}
#endif
	/*
	 * Snapshot of this processor's pending-AST bits, re-sampled at every
	 * restart. Used later to detect bits that were set while the pset
	 * lock was dropped.
	 */
	bool pending_AST_URGENT = false;
	bool pending_AST_PREEMPT = false;

#ifdef LOOP_COUNT
	int loop_count = -1;
#endif

	do {
		/*
		 *	Update the priority.
		 */
		if (SCHED(can_update_priority)(thread)) {
			SCHED(update_priority)(thread);
		}

		pset_lock(pset);

restart:
		/* Every jump to restart arrives with the pset lock held. */
#ifdef LOOP_COUNT
		loop_count++;
		if (loop_count > max_loop_count[processor->cpu_id]) {
			max_loop_count[processor->cpu_id] = loop_count;
			/* Only log when the new maximum is a power of two. */
			if (bit_count(loop_count) == 1) {
				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
			}
		}
#endif
		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

		processor_state_update_from_thread(processor, thread, true);

		idle_reason = 0;

		/*
		 * ast_processor / ipi_type: SMT sibling to signal, if any.
		 * next_rt_processor / next_rt_ipi_type: spill or followup target
		 * for remaining realtime work, if any.
		 */
		processor_t ast_processor = PROCESSOR_NULL;
		processor_t next_rt_processor = PROCESSOR_NULL;
		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;

		assert(processor->state != PROCESSOR_OFF_LINE);

		/*
		 * Bound threads are dispatched to a processor without going through
		 * choose_processor(), so in those cases we must continue trying to dequeue work
		 * as we are the only option.
		 */
		if (!SCHED(processor_bound_count)(processor)) {
			if (!processor->is_recommended) {
				/*
				 * The performance controller has provided a hint to not dispatch more threads.
				 */
				idle_reason = 1;
				goto send_followup_ipi_before_idle;
			} else if (rt_runq_count(pset)) {
				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
				/* Give the current RT thread a chance to complete */
				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
#if defined(__x86_64__)
				/*
				 * On Intel we want to avoid SMT secondary processors and processor 0
				 * but allow them to be used as backup processors in case the preferred chosen
				 * processor is delayed by interrupts or processor stalls.  So if it is
				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
				 * to grab the thread before the (current) backup processor does.
				 *
				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
				 * on DEVELOPMENT || DEBUG kernels.  It is also adjusted (see above) depending on whether we want to use
				 * cpu0 before secondary cpus or not.
				 */
				if (!ok_to_run_realtime_thread) {
					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
						if (timeout_count-- > 0) {
							/* Drop both locks across the delay, then retake them in the reverse order. */
							pset_unlock(pset);
							thread_unlock(thread);
							delay(10);
							delay_count++;
							thread_lock(thread);
							pset_lock(pset);
							goto restart;
						}
						/* Delay budget exhausted: run the RT thread here as the backup. */
						ok_to_run_realtime_thread = true;
					}
				}
#endif
				if (!ok_to_run_realtime_thread) {
					idle_reason = 2;
					goto send_followup_ipi_before_idle;
				}
			} else if (processor->processor_primary != processor) {
				/*
				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
				 * we should look for work only under the same conditions that choose_processor()
				 * would have assigned work, which is when all primary processors have been assigned work.
				 */
				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
					/* There are idle primaries */
					idle_reason = 3;
					goto idle;
				}
			}
		}

		/*
		 *	Test to see if the current thread should continue
		 *	to run on this processor.  Must not be attempting to wait, and not
		 *	bound to a different processor, nor be in the wrong
		 *	processor set, nor be forced to context switch by TH_SUSP.
		 *
		 *	Note that there are never any RT threads in the regular runqueue.
		 *
		 *	This code is very insanely tricky.
		 */

		/* i.e. not waiting, not TH_SUSP'ed */
		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);

		/*
		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
		 *       <rdar://problem/47907700>
		 *
		 * A yielding thread shouldn't be forced to context switch.
		 */

		bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;

		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;

		bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;

		bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;

		bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread);

		/* Evaluated with as_backup=true, unlike the preferred-processor check above. */
		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);

		bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
		if (current_thread_can_keep_running) {
			/*
			 * This thread is eligible to keep running on this processor.
			 *
			 * RT threads with un-expired quantum stay on processor,
			 * unless there's a valid RT thread with an earlier deadline
			 * and it is still ok_to_run_realtime_thread.
			 */
			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
				/*
				 * Allow low latency realtime threads to keep running.
				 * Pick a new RT thread only if ok_to_run_realtime_thread
				 * (but the current thread is allowed to complete).
				 */
				if ((thread->realtime.constraint > rt_constraint_ll) && ok_to_run_realtime_thread) {
					/* A spill IPI is pending for this processor: go and pick an RT thread. */
					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
						goto pick_new_rt_thread;
					}
					if (rt_runq_priority(pset) > thread->sched_pri) {
						if (sched_rt_runq_strict_priority) {
							/* The next RT thread is better, so pick it off the runqueue. */
							goto pick_new_rt_thread;
						}

						/*
						 * See if the current lower priority thread can continue to run without causing
						 * the higher priority thread on the runq queue to miss its deadline.
						 */
						thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
							/* The next RT thread is better, so pick it off the runqueue. */
							goto pick_new_rt_thread;
						}
					} else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
						/* The next RT thread is better, so pick it off the runqueue. */
						goto pick_new_rt_thread;
					}
					/* Another pset in the node advertises a stealable RT thread with an earlier deadline. */
					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
						goto pick_new_rt_thread;
					}
				}

				/* This is still the best RT thread to run. */
				processor->deadline = thread->realtime.deadline;

				sched_update_pset_load_average(pset, 0);

				clear_pending_AST_bits(pset, processor, 1);

				next_rt_processor = PROCESSOR_NULL;
				next_rt_ipi_type = SCHED_IPI_NONE;

				/*
				 * Before returning, pick another processor to signal for RT
				 * threads still queued: a spill target in another pset, or
				 * a followup target in this one.
				 */
				bool pset_unlocked = false;
				__kdebug_only next_processor_type_t nptype = none;
				if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
					nptype = spill;
					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
				} else if (pset_needs_a_followup_IPI(pset)) {
					nptype = followup;
					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
				}
				if (!pset_unlocked) {
					pset_unlock(pset);
				}

				/* The IPI is sent only after the pset lock has been dropped. */
				if (next_rt_processor) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
				}

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
				return thread;
			}

			if ((rt_runq_count(pset) == 0) &&
			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
				/* This thread is still the highest priority runnable (non-idle) thread */
				processor->deadline = RT_DEADLINE_NONE;

				sched_update_pset_load_average(pset, 0);

				clear_pending_AST_bits(pset, processor, 2);

				pset_unlock(pset);

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
				return thread;
			}
		} else {
			/*
			 * This processor must context switch.
			 * If it's due to a rebalance, we should aggressively find this thread a new home.
			 */
			if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
				*reason |= AST_REBALANCE;
			}
		}

		/*
		 * True when this processor's secondary is idle and the outgoing
		 * thread is no-SMT or realtime. Consulted further down when a
		 * regular thread is chosen.
		 */
		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
		    (processor->processor_secondary->state == PROCESSOR_IDLE));

		/* OK, so we're not going to run the current thread. Look at the RT queue. */
		if (ok_to_run_realtime_thread) {
pick_new_rt_thread:
			new_thread = sched_rt_choose_thread(pset);
			if (new_thread != THREAD_NULL) {
				processor->deadline = new_thread->realtime.deadline;
				pset_commit_processor_to_new_thread(pset, processor, new_thread);

				clear_pending_AST_bits(pset, processor, 3);

				/* A running or dispatching SMT secondary gets an SMT-rebalance IPI. */
				if (processor->processor_secondary != NULL) {
					processor_t sprocessor = processor->processor_secondary;
					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
						ast_processor = sprocessor;
					}
				}
			}
		}

send_followup_ipi_before_idle:
		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
		}
		__kdebug_only next_processor_type_t nptype = none;
		bool pset_unlocked = false;
		if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
			nptype = spill;
			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
		} else if (pset_needs_a_followup_IPI(pset)) {
			nptype = followup;
			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
		}

		/* ast_processor is only ever set together with a chosen RT thread. */
		assert(new_thread || !ast_processor);
		if (new_thread || next_rt_processor) {
			if (!pset_unlocked) {
				pset_unlock(pset);
				pset_unlocked = true;
			}
			/* Don't signal the same processor twice. */
			if (ast_processor == next_rt_processor) {
				ast_processor = PROCESSOR_NULL;
				ipi_type = SCHED_IPI_NONE;
			}

			if (ast_processor) {
				sched_ipi_perform(ast_processor, ipi_type);
			}

			if (next_rt_processor) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
			}

			if (new_thread) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
				return new_thread;
			}
		}

		/* No RT thread was chosen: retake the pset lock if it was dropped above. */
		if (pset_unlocked) {
			pset_lock(pset);
		}

		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			/* Things changed while we dropped the lock */
			goto restart;
		}

		if (processor->is_recommended) {
			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
				/* Things changed while we dropped the lock */
				goto restart;
			}

			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
				/* secondary can only run realtime thread */
				if (idle_reason == 0) {
					idle_reason = 4;
				}
				goto idle;
			}
		} else if (!SCHED(processor_bound_count)(processor)) {
			/* processor not recommended and no bound threads */
			if (idle_reason == 0) {
				idle_reason = 5;
			}
			goto idle;
		}

		processor->deadline = RT_DEADLINE_NONE;

		/* No RT threads, so let's look at the regular threads. */
		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
			pset_commit_processor_to_new_thread(pset, processor, new_thread);

			clear_pending_AST_bits(pset, processor, 4);

			ast_processor = PROCESSOR_NULL;
			ipi_type = SCHED_IPI_NONE;

			/*
			 * Decide whether the SMT secondary needs an IPI: a rebalance
			 * if it is running and the new thread is no-SMT, or a preempt
			 * if it was forced idle, the new thread allows SMT and the
			 * pset has stealable threads.
			 */
			processor_t sprocessor = processor->processor_secondary;
			if (sprocessor != NULL) {
				if (sprocessor->state == PROCESSOR_RUNNING) {
					if (thread_no_smt(new_thread)) {
						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
						ast_processor = sprocessor;
					}
				} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
					ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
					ast_processor = sprocessor;
				}
			}
			pset_unlock(pset);

			if (ast_processor) {
				sched_ipi_perform(ast_processor, ipi_type);
			}
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
			return new_thread;
		}

		/* One-shot flag: clear it and go idle this time around. */
		if (processor->must_idle) {
			processor->must_idle = false;
			*reason |= AST_REBALANCE;
			idle_reason = 6;
			goto idle;
		}

		/* Only primary processors attempt to steal. */
		if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
			/*
			 * No runnable threads, attempt to steal
			 * from other processors. Returns with pset lock dropped.
			 */

			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
				pset_lock(pset);
				pset_commit_processor_to_new_thread(pset, processor, new_thread);
				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					/*
					 * A realtime thread chose this processor while it was DISPATCHING
					 * and the pset lock was dropped
					 */
					ast_on(AST_URGENT | AST_PREEMPT);
				}

				clear_pending_AST_bits(pset, processor, 5);

				pset_unlock(pset);

				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
				return new_thread;
			}

			/*
			 * If other threads have appeared, shortcut
			 * around again.
			 *
			 * The pset lock is not held here, and new_thread is
			 * THREAD_NULL, so "continue" re-enters the loop body from
			 * the top and takes the lock again.
			 */
			if (SCHED(processor_bound_count)(processor)) {
				continue;
			}
			if (processor->is_recommended) {
				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
					continue;
				}
			}

			pset_lock(pset);
		}

idle:
		/* Someone selected this processor while we had dropped the lock */
		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
			goto restart;
		}

		if ((idle_reason == 0) && current_thread_can_keep_running) {
			/* This thread is the only runnable (non-idle) thread */
			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
				processor->deadline = thread->realtime.deadline;
			} else {
				processor->deadline = RT_DEADLINE_NONE;
			}

			sched_update_pset_load_average(pset, 0);

			/*
			 * NOTE(review): unlike the two earlier "keep running the
			 * current thread" returns, this path does not call
			 * clear_pending_AST_bits() -- confirm that is intentional.
			 */
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
			pset_unlock(pset);
			return thread;
		}

		/*
		 *	Nothing is runnable, or this processor must be forced idle,
		 *	so set this processor idle if it was running.
		 */
		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
			processor_state_update_idle(processor);
		}
		pset_update_rt_stealable_state(pset);

		clear_pending_AST_bits(pset, processor, 6);

		/* Invoked with pset locked, returns with pset unlocked */
		SCHED(processor_balance)(processor, pset);

		new_thread = processor->idle_thread;
	} while (new_thread == THREAD_NULL);

	/* Idle-thread return: the trace code is 10 + idle_reason. */
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
	return new_thread;
}
2833
/*
 *	thread_invoke
 *
 *	Called at splsched with neither thread locked.
 *
 *	Perform a context switch and start executing the new thread.
 *
 *	Returns FALSE when the context switch didn't happen.
 *	The reference to the new thread is still consumed.
 *
 *	"self" is what is currently running on the processor,
 *	"thread" is the new thread to context switch to
 *	(which may be the same thread in some cases)
 *
 *	"reason" is recorded in self->reason before switching away and is
 *	consulted here for AST_HANDOFF.
 *
 *	Overview of the paths taken below:
 *	  - self has a continuation and "thread" has no kernel stack:
 *	    stack handoff, then call_continuation() on the new thread's
 *	    continuation (does not return).
 *	  - self has a continuation and thread == self: the continuation is
 *	    called directly (does not return).
 *	  - "thread" has no kernel stack and stack_alloc_try() fails: the
 *	    thread is passed to thread_stack_enqueue() and FALSE is returned.
 *	  - self has no continuation and thread == self: returns TRUE without
 *	    switching.
 *	  - otherwise: full switch through machine_switch_context(); TRUE is
 *	    returned once "self" is running again.
 */
static boolean_t
thread_invoke(
	thread_t self,
	thread_t thread,
	ast_t reason)
{
	/* A non-zero preemption level here is fatal; the panic string names the likely cause. */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Snapshot self's continuation; these locals are replaced by thread's on the handoff path. */
	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	/* Single timestamp used for all accounting done during this switch. */
	uint64_t ctime = mach_absolute_time();

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	/* The incoming thread cannot have become runnable after "now". */
	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Consider timeshare maintenance unless we are switching to an idle
	 * thread, or a realtime thread is performing a handoff.
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime);
	}
#endif

#if MONOTONIC
	mt_sched_update(self);
#endif /* MONOTONIC */

	/* Sanity checks on the outgoing thread: on core, off any run queue, runnable. */
	assert_thread_magic(self);
	assert(self == current_thread());
	assert(self->runq == PROCESSOR_NULL);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	thread_lock(thread);

	/* Sanity checks on the incoming thread: runnable only, bound here if bound, off any run queue. */
	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	assert(thread->runq == PROCESSOR_NULL);

	/* Reload precise timing global policy to thread-local policy */
	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if INTERRUPT_MASKED_DEBUG
	ml_spin_debug_clear(thread);
#endif

	if (continuation != NULL) {
		/* self is parking in a continuation, so its stack contents need not be preserved. */
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			/* From here on the locals describe the NEW thread's continuation. */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Count processor and processor-set migrations, then the context switch itself. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			/* Timer and accounting hand-over, all stamped with ctime. */
			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			processor_timer_switch_thread(ctime, &thread->system_timer);
			timer_update(&thread->runnable_timer, ctime);
			processor->kernel_timer = &thread->system_timer;

			/*
			 * Since non-precise user/kernel time doesn't update the state timer
			 * during privilege transitions, synthesize an event now.
			 */
			if (!thread->precise_user_kernel_time) {
				timer_update(processor->current_state, ctime);
			}

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			/* Trace when the thread ends up on a processor other than the one chosen for it. */
			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required.  However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			/* Finish the switch on behalf of the new thread; "self" is now the old thread. */
			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			/* The continuation was captured in locals above; clear it on the thread before calling it. */
			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
		/*
		 * Remaining case: "thread" already has a kernel stack and is not
		 * self; fall through to the full context switch below.
		 */
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/*
			 * Also reached by goto from the continuation path when a stack
			 * handoff is not permitted.  If no stack can be obtained right
			 * now, hand the thread to thread_stack_enqueue() and report
			 * that no switch happened.
			 */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Selected ourselves with no continuation: nothing to switch. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	/* Count processor and processor-set migrations, then the context switch itself. */
	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	/* Timer and accounting hand-over, all stamped with ctime. */
	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	processor_timer_switch_thread(ctime, &thread->system_timer);
	timer_update(&thread->runnable_timer, ctime);
	processor->kernel_timer = &thread->system_timer;

	/*
	 * Since non-precise user/kernel time doesn't update the state timer
	 * during privilege transitions, synthesize an event now.
	 */
	if (!thread->precise_user_kernel_time) {
		timer_update(processor->current_state, ctime);
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	/* Trace when the thread ends up on a processor other than the one chosen for it. */
	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required.  We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */
	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	/* Execution only resumes here for a thread that blocked without a continuation. */
	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* We have been resumed and are set to run. */
	/* "thread" now names the thread that was running before us (see above). */
	thread_dispatch(thread, self);

	return TRUE;
}
3185
#if defined(CONFIG_SCHED_DEFERRED_AST)
/*
 *	pset_cancel_deferred_dispatch:
 *
 *	If the current processor is running the only runnable thread in the
 *	system, cancel every cancelable (deferred) AST pending in "pset" and
 *	push the affected processors back to the idle state.
 *
 *	"processor" is the caller's processor; it is never rolled back.
 *
 *	Assumes the current thread is runnable.  The pset lock is taken and
 *	released here, so the caller must not hold it.
 */
static void
pset_cancel_deferred_dispatch(
	processor_set_t pset,
	processor_t processor)
{
	pset_lock(pset);

	uint32_t sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * Only act when exactly one thread is runnable and at least one
	 * deferred AST is outstanding in this pset; otherwise there is
	 * nothing worth cancelling.
	 *
	 * The run count is updated atomically but is not guarded by the pset
	 * lock, so another thread may become runnable while we work.  The
	 * path making it runnable is expected to need this pset's lock to
	 * dispatch it to one of our processors, so it will wait until we are
	 * done and then signal again.  That keeps the result correct (no
	 * runnable thread is left undispatched) even if not ideal (the
	 * dispatch may have to be restarted).
	 */
	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * Candidates: processors that are DISPATCHING, have a deferred
		 * AST pending, and have no urgent (non-cancelable) AST pending.
		 */
		uint64_t cancelable_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		for (int cpu = lsb_first(cancelable_map); cpu >= 0; cpu = lsb_next(cancelable_map, cpu)) {
			processor_t target = processor_array[cpu];

			/*
			 * The current processor should not be DISPATCHING at
			 * all, so this check is defensive.
			 */
			if (target == processor) {
				continue;
			}

			/*
			 * Roll the processor back to a reasonable facsimile of
			 * PROCESSOR_IDLE and withdraw the signal.  The signal is
			 * arbitrated by hardware and may still fire while we do
			 * this; a core that responds anyway is no different
			 * from one that took some interrupt while idle.
			 */
			processor_state_update_idle(target);
			target->deadline = RT_DEADLINE_NONE;
			pset_update_processor_state(pset, target, PROCESSOR_IDLE);
			bit_clear(pset->pending_deferred_AST_cpu_mask, target->cpu_id);
			machine_signal_idle_cancel(target);
		}
	}

	pset_unlock(pset);
}
#else
/* We don't support deferred ASTs; everything is candycanes and sunshine. */
#endif
3272
3273 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3274 thread_csw_callout(
3275 thread_t old,
3276 thread_t new,
3277 uint64_t timestamp)
3278 {
3279 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3280 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3281 machine_switch_perfcontrol_context(event, timestamp, 0,
3282 same_pri_latency, old, new);
3283 }
3284
3285
/*
 *	thread_dispatch:
 *
 *	Handle threads at context switch.  Re-dispatch other thread
 *	if still running, otherwise update run state and perform
 *	special actions.  Update quantum for other thread and begin
 *	the quantum for ourselves.
 *
 *      "thread" is the old thread that we have switched away from.
 *      It may be THREAD_NULL, in which case only the bookkeeping for
 *      "self" is performed.
 *      "self" is the new current thread that we have context switched to
 *
 *	Called at splsched.
 *
 *	Takes and releases the wake lock and thread lock of "thread", and
 *	the thread lock of "self".
 */
void
thread_dispatch(
	thread_t thread,
	thread_t self)
{
	processor_t processor = self->last_processor;
	/* Set when the thread we switched away from was an idle thread. */
	bool was_idle = false;

	assert(processor == current_processor());
	assert(self == current_thread_volatile());
	assert(thread != self);

	if (thread != THREAD_NULL) {
		/*
		 * Do the perfcontrol callout for context switch.
		 * The reason we do this here is:
		 * - thread_dispatch() is called from various places that are not
		 *   the direct context switch path, e.g. processor shutdown etc.
		 *   So adding the callout here covers all those cases.
		 * - We want this callout as early as possible to be close
		 *   to the timestamp taken in thread_invoke()
		 * - We want to avoid holding the thread lock while doing the
		 *   callout
		 * - We do not want to callout if "thread" is NULL.
		 */
		thread_csw_callout(thread, self, processor->last_dispatch);

#if KASAN
		if (thread->continuation != NULL) {
			/*
			 * Thread has a continuation and the normal stack is going away.
			 * Unpoison the stack and mark all fakestack objects as unused.
			 */
			kasan_fakestack_drop(thread);
			if (thread->kernel_stack) {
				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
			}
		}

		/*
		 * Free all unused fakestack objects.
		 */
		kasan_fakestack_gc(thread);
#endif

		/*
		 *	If blocked at a continuation, discard
		 *	the stack.
		 */
		if (thread->continuation != NULL && thread->kernel_stack != 0) {
			stack_free(thread);
		}

		if (thread->state & TH_IDLE) {
			/* Idle threads get no quantum/ledger accounting and are not re-dispatched here. */
			was_idle = true;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
			    (uintptr_t)thread_tid(thread), 0, thread->state,
			    sched_run_buckets[TH_BUCKET_RUN], 0);
		} else {
			int64_t consumed;
			int64_t remainder = 0;

			/* Time left until quantum_end as of last_dispatch; stays 0 if the quantum end has passed. */
			if (processor->quantum_end > processor->last_dispatch) {
				remainder = processor->quantum_end -
				    processor->last_dispatch;
			}

			/* CPU time charged for this run: the thread's recorded quantum minus what is left. */
			consumed = thread->quantum_remaining - remainder;

			/* Billing is skipped when the thread went off core with AST_LEDGER set. */
			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the task and
				 * the individual thread.
				 */
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit_thread(thread, thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
				if (thread->t_bankledger) {
					ledger_credit_thread(thread, thread->t_bankledger,
					    bank_ledgers.cpu_time,
					    (consumed - thread->t_deduct_bank_ledger_time));
				}
				thread->t_deduct_bank_ledger_time = 0;
				if (consumed > 0) {
					/*
					 * This should never be negative, but in traces we are seeing some instances
					 * of consumed being negative.
					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
					 */
					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
				}
			}

			/* For the thread that we just context switched away from, figure
			 * out if we have expired the wq quantum and set the AST if we have
			 */
			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
				thread_evaluate_workqueue_quantum_expiry(thread);
			}

			/* Both locks are held until the runnable/waiting handling below releases them. */
			wake_lock(thread);
			thread_lock(thread);

			/*
			 * Apply a priority floor if the thread holds a kernel resource
			 * or explicitly requested it.
			 * Do this before checking starting_pri to avoid overpenalizing
			 * repeated rwlock blockers.
			 */
			if (__improbable(thread->rwlock_count != 0)) {
				lck_rw_set_promotion_locked(thread);
			}
			if (__improbable(thread->priority_floor_count != 0)) {
				thread_floor_boost_set_promotion_locked(thread);
			}

			boolean_t keep_quantum = processor->first_timeslice;

			/*
			 * Treat a thread which has dropped priority since it got on core
			 * as having expired its quantum.
			 */
			if (processor->starting_pri > thread->sched_pri) {
				keep_quantum = FALSE;
			}

			/* Compute remainder of current quantum. */
			if (keep_quantum &&
			    processor->quantum_end > processor->last_dispatch) {
				thread->quantum_remaining = (uint32_t)remainder;
			} else {
				thread->quantum_remaining = 0;
			}

			if (thread->sched_mode == TH_MODE_REALTIME) {
				/*
				 *	Cancel the deadline if the thread has
				 *	consumed the entire quantum.
				 */
				if (thread->quantum_remaining == 0) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
				}
			} else {
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
				/*
				 *	For non-realtime threads treat a tiny
				 *	remaining quantum as an expired quantum
				 *	but include what's left next time.
				 */
				if (thread->quantum_remaining < min_std_quantum) {
					thread->reason |= AST_QUANTUM;
					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
				}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
			}

			/*
			 *	If we are doing a direct handoff then
			 *	take the remainder of the quantum.
			 */
			/* Applies only when AST_HANDOFF is set and AST_QUANTUM is not. */
			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
				self->quantum_remaining = thread->quantum_remaining;
				thread->reason |= AST_QUANTUM;
				thread->quantum_remaining = 0;
			} else {
#if defined(CONFIG_SCHED_MULTIQ)
				/* With sched groups enabled, threads of the same group also pass on the quantum. */
				if (SCHED(sched_groups_enabled) &&
				    thread->sched_group == self->sched_group) {
					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
					    self->reason, (uintptr_t)thread_tid(thread),
					    self->quantum_remaining, thread->quantum_remaining, 0);

					self->quantum_remaining = thread->quantum_remaining;
					thread->quantum_remaining = 0;
					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
				}
#endif /* defined(CONFIG_SCHED_MULTIQ) */
			}

			/* Accumulate the time since the thread's computation epoch. */
			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);

			if (!(thread->state & TH_WAIT)) {
				/*
				 *	Still runnable.
				 */
				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;

				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);

				/* Translate the reason the thread left the core into thread_setrun() options. */
				ast_t reason = thread->reason;
				sched_options_t options = SCHED_NONE;

				if (reason & AST_REBALANCE) {
					options |= SCHED_REBALANCE;
					if (reason & AST_QUANTUM) {
						/*
						 * Having gone to the trouble of forcing this thread off a less preferred core,
						 * we should force the preferable core to reschedule immediately to give this
						 * thread a chance to run instead of just sitting on the run queue where
						 * it may just be stolen back by the idle core we just forced it off.
						 * But only do this at the end of a quantum to prevent cascading effects.
						 */
						options |= SCHED_PREEMPT;
					}
				}

				/*
				 * Quantum expiry takes SCHED_TAILQ, preemption takes
				 * SCHED_HEADQ, anything else takes SCHED_TAILQ with
				 * SCHED_PREEMPT.
				 */
				if (reason & AST_QUANTUM) {
					options |= SCHED_TAILQ;
				} else if (reason & AST_PREEMPT) {
					options |= SCHED_HEADQ;
				} else {
					options |= (SCHED_PREEMPT | SCHED_TAILQ);
				}

				thread_setrun(thread, options);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
				    sched_run_buckets[TH_BUCKET_RUN], 0);

				/* Clear wake_active under the thread lock; issue the wakeup after dropping it. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);
			} else {
				/*
				 *	Waiting.
				 */
				boolean_t should_terminate = FALSE;
				uint32_t new_run_count;
				int thread_state = thread->state;

				/* Only the first call to thread_dispatch
				 * after explicit termination should add
				 * the thread to the termination queue
				 */
				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
					should_terminate = TRUE;
					thread_state |= TH_TERMINATE2;
				}

				timer_stop(&thread->runnable_timer, processor->last_dispatch);

				/* The thread is no longer running: drop TH_RUN and publish the new state. */
				thread_state &= ~TH_RUN;
				thread->state = thread_state;

				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
				thread->chosen_processor = PROCESSOR_NULL;

				new_run_count = SCHED(run_count_decr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
					work_interval_auto_join_unwind(thread);
				}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_SFI
				/* Record when an SFI-induced wait began. */
				if (thread->reason & AST_SFI) {
					thread->wait_sfi_begin_time = processor->last_dispatch;
				}
#endif
				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
				    new_run_count, 0);

				/* Notify the thread's sched_call hook that it blocked, if it asked for wait reports. */
				if (thread_state & TH_WAIT_REPORT) {
					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
				}

				/* Clear wake_active under the thread lock; issue the wakeup after dropping it. */
				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);

				if (should_terminate) {
					thread_terminate_enqueue(thread);
				}
			}
		}
		/*
		 * The thread could have been added to the termination queue, so it's
		 * unsafe to use after this point.
		 */
		thread = THREAD_NULL;
	}

	/* Values reported to machine_thread_going_on_core() below; idle threads keep these defaults. */
	int urgency = THREAD_URGENCY_NONE;
	uint64_t latency = 0;

	/* Update (new) current thread and reprogram running timers */
	thread_lock(self);

	if (!(self->state & TH_IDLE)) {
		uint64_t arg1, arg2;

#if CONFIG_SCHED_SFI
		ast_t new_ast;

		new_ast = sfi_thread_needs_ast(self, NULL);

		if (new_ast != AST_NONE) {
			ast_on(new_ast);
		}
#endif

		if (processor->last_dispatch < self->last_made_runnable_time) {
			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
			    processor->last_dispatch, self->last_made_runnable_time);
		}

		assert(self->last_made_runnable_time <= self->last_basepri_change_time);

		/* Time between becoming runnable and this dispatch. */
		latency = processor->last_dispatch - self->last_made_runnable_time;
		assert(latency >= self->same_pri_latency);

		urgency = thread_get_urgency(self, &arg1, &arg2);

		thread_tell_urgency(urgency, arg1, arg2, latency, self);

		/*
		 *	Get a new quantum if none remaining.
		 */
		if (self->quantum_remaining == 0) {
			thread_quantum_init(self);
		}

		/*
		 *	Set up quantum timer and timeslice.
		 */
		processor->quantum_end = processor->last_dispatch +
		    self->quantum_remaining;

		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
		    processor->quantum_end, processor->last_dispatch);
		if (was_idle) {
			/*
			 * kperf's running timer is active whenever the idle thread for a
			 * CPU is not running.
			 */
			kperf_running_setup(processor, processor->last_dispatch);
		}
		running_timers_activate(processor);
		processor->first_timeslice = TRUE;
	} else {
		/* Switching to the idle thread: stop the running timers and report no urgency. */
		running_timers_deactivate(processor);
		processor->first_timeslice = FALSE;
		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	}

	/* Start a fresh computation epoch and remember the priority self came on core with. */
	assert(self->block_hint == kThreadWaitNone);
	self->computation_epoch = processor->last_dispatch;
	self->reason = AST_NONE;
	processor->starting_pri = self->sched_pri;

	thread_unlock(self);

	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	    processor->last_dispatch);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	/*
	 * TODO: Can we state that redispatching our old thread is also
	 * uninteresting?
	 */
	/* If self is the only runnable thread and is not idle, cancel deferred dispatches in our pset. */
	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
		pset_cancel_deferred_dispatch(processor->processor_set, processor);
	}
#endif
}
3691
3692 /*
3693 * thread_block_reason:
3694 *
3695 * Forces a reschedule, blocking the caller if a wait
3696 * has been asserted.
3697 *
3698 * If a continuation is specified, then thread_invoke will
3699 * attempt to discard the thread's kernel stack. When the
3700 * thread resumes, it will execute the continuation function
3701 * on a new kernel stack.
3702 */
3703 wait_result_t
thread_block_reason(thread_continue_t continuation,void * parameter,ast_t reason)3704 thread_block_reason(
3705 thread_continue_t continuation,
3706 void *parameter,
3707 ast_t reason)
3708 {
3709 thread_t self = current_thread();
3710 processor_t processor;
3711 thread_t new_thread;
3712 spl_t s;
3713
3714 s = splsched();
3715
3716 processor = current_processor();
3717
3718 /* If we're explicitly yielding, force a subsequent quantum */
3719 if (reason & AST_YIELD) {
3720 processor->first_timeslice = FALSE;
3721 }
3722
3723 /* We're handling all scheduling AST's */
3724 ast_off(AST_SCHEDULING);
3725
3726 #if PROC_REF_DEBUG
3727 if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3728 uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3729 }
3730 #endif
3731
3732 self->continuation = continuation;
3733 self->parameter = parameter;
3734
3735 if (self->state & ~(TH_RUN | TH_IDLE)) {
3736 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3737 MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3738 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3739 }
3740
3741 do {
3742 thread_lock(self);
3743 new_thread = thread_select(self, processor, &reason);
3744 thread_unlock(self);
3745 } while (!thread_invoke(self, new_thread, reason));
3746
3747 splx(s);
3748
3749 return self->wait_result;
3750 }
3751
3752 /*
3753 * thread_block:
3754 *
3755 * Block the current thread if a wait has been asserted.
3756 */
3757 wait_result_t
thread_block(thread_continue_t continuation)3758 thread_block(
3759 thread_continue_t continuation)
3760 {
3761 return thread_block_reason(continuation, NULL, AST_NONE);
3762 }
3763
3764 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3765 thread_block_parameter(
3766 thread_continue_t continuation,
3767 void *parameter)
3768 {
3769 return thread_block_reason(continuation, parameter, AST_NONE);
3770 }
3771
3772 /*
3773 * thread_run:
3774 *
3775 * Switch directly from the current thread to the
3776 * new thread, handing off our quantum if appropriate.
3777 *
3778 * New thread must be runnable, and not on a run queue.
3779 *
3780 * Called at splsched.
3781 */
3782 int
thread_run(thread_t self,thread_continue_t continuation,void * parameter,thread_t new_thread)3783 thread_run(
3784 thread_t self,
3785 thread_continue_t continuation,
3786 void *parameter,
3787 thread_t new_thread)
3788 {
3789 ast_t reason = AST_NONE;
3790
3791 if ((self->state & TH_IDLE) == 0) {
3792 reason = AST_HANDOFF;
3793 }
3794
3795 /*
3796 * If this thread hadn't been setrun'ed, it
3797 * might not have a chosen processor, so give it one
3798 */
3799 if (new_thread->chosen_processor == NULL) {
3800 new_thread->chosen_processor = current_processor();
3801 }
3802
3803 self->continuation = continuation;
3804 self->parameter = parameter;
3805
3806 while (!thread_invoke(self, new_thread, reason)) {
3807 /* the handoff failed, so we have to fall back to the normal block path */
3808 processor_t processor = current_processor();
3809
3810 reason = AST_NONE;
3811
3812 thread_lock(self);
3813 new_thread = thread_select(self, processor, &reason);
3814 thread_unlock(self);
3815 }
3816
3817 return self->wait_result;
3818 }
3819
3820 /*
3821 * thread_continue:
3822 *
3823 * Called at splsched when a thread first receives
3824 * a new stack after a continuation.
3825 *
3826 * Called with THREAD_NULL as the old thread when
3827 * invoked by machine_load_context.
3828 */
3829 void
thread_continue(thread_t thread)3830 thread_continue(
3831 thread_t thread)
3832 {
3833 thread_t self = current_thread();
3834 thread_continue_t continuation;
3835 void *parameter;
3836
3837 DTRACE_SCHED(on__cpu);
3838
3839 continuation = self->continuation;
3840 parameter = self->parameter;
3841
3842 assert(continuation != NULL);
3843
3844 #if KPERF
3845 kperf_on_cpu(self, continuation, NULL);
3846 #endif
3847
3848 thread_dispatch(thread, self);
3849
3850 self->continuation = self->parameter = NULL;
3851
3852 #if INTERRUPT_MASKED_DEBUG
3853 /* Reset interrupt-masked spin debugging timeout */
3854 ml_spin_debug_clear(self);
3855 #endif
3856
3857 TLOG(1, "thread_continue: calling call_continuation\n");
3858
3859 boolean_t enable_interrupts = TRUE;
3860
3861 /* bootstrap thread, idle thread need to stay interrupts-disabled */
3862 if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
3863 enable_interrupts = FALSE;
3864 }
3865
3866 call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
3867 /*NOTREACHED*/
3868 }
3869
3870 void
thread_quantum_init(thread_t thread)3871 thread_quantum_init(thread_t thread)
3872 {
3873 if (thread->sched_mode == TH_MODE_REALTIME) {
3874 thread->quantum_remaining = thread->realtime.computation;
3875 } else {
3876 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3877 }
3878 }
3879
3880 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)3881 sched_timeshare_initial_quantum_size(thread_t thread)
3882 {
3883 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
3884 return bg_quantum;
3885 } else {
3886 return std_quantum;
3887 }
3888 }
3889
3890 /*
3891 * run_queue_init:
3892 *
3893 * Initialize a run queue before first use.
3894 */
3895 void
run_queue_init(run_queue_t rq)3896 run_queue_init(
3897 run_queue_t rq)
3898 {
3899 rq->highq = NOPRI;
3900 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
3901 rq->bitmap[i] = 0;
3902 }
3903 rq->urgency = rq->count = 0;
3904 for (int i = 0; i < NRQS; i++) {
3905 circle_queue_init(&rq->queues[i]);
3906 }
3907 }
3908
3909 /*
3910 * run_queue_dequeue:
3911 *
3912 * Perform a dequeue operation on a run queue,
3913 * and return the resulting thread.
3914 *
3915 * The run queue must be locked (see thread_run_queue_remove()
3916 * for more info), and not empty.
3917 */
3918 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)3919 run_queue_dequeue(
3920 run_queue_t rq,
3921 sched_options_t options)
3922 {
3923 thread_t thread;
3924 circle_queue_t queue = &rq->queues[rq->highq];
3925
3926 if (options & SCHED_HEADQ) {
3927 thread = cqe_dequeue_head(queue, struct thread, runq_links);
3928 } else {
3929 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
3930 }
3931
3932 assert(thread != THREAD_NULL);
3933 assert_thread_magic(thread);
3934
3935 thread->runq = PROCESSOR_NULL;
3936 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3937 rq->count--;
3938 if (SCHED(priority_is_urgent)(rq->highq)) {
3939 rq->urgency--; assert(rq->urgency >= 0);
3940 }
3941 if (circle_queue_empty(queue)) {
3942 bitmap_clear(rq->bitmap, rq->highq);
3943 rq->highq = bitmap_first(rq->bitmap, NRQS);
3944 }
3945
3946 return thread;
3947 }
3948
3949 /*
3950 * run_queue_enqueue:
3951 *
3952 * Perform a enqueue operation on a run queue.
3953 *
3954 * The run queue must be locked (see thread_run_queue_remove()
3955 * for more info).
3956 */
3957 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)3958 run_queue_enqueue(
3959 run_queue_t rq,
3960 thread_t thread,
3961 sched_options_t options)
3962 {
3963 circle_queue_t queue = &rq->queues[thread->sched_pri];
3964 boolean_t result = FALSE;
3965
3966 assert_thread_magic(thread);
3967
3968 if (circle_queue_empty(queue)) {
3969 circle_enqueue_tail(queue, &thread->runq_links);
3970
3971 rq_bitmap_set(rq->bitmap, thread->sched_pri);
3972 if (thread->sched_pri > rq->highq) {
3973 rq->highq = thread->sched_pri;
3974 result = TRUE;
3975 }
3976 } else {
3977 if (options & SCHED_TAILQ) {
3978 circle_enqueue_tail(queue, &thread->runq_links);
3979 } else {
3980 circle_enqueue_head(queue, &thread->runq_links);
3981 }
3982 }
3983 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3984 rq->urgency++;
3985 }
3986 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3987 rq->count++;
3988
3989 return result;
3990 }
3991
3992 /*
3993 * run_queue_remove:
3994 *
3995 * Remove a specific thread from a runqueue.
3996 *
3997 * The run queue must be locked.
3998 */
3999 void
run_queue_remove(run_queue_t rq,thread_t thread)4000 run_queue_remove(
4001 run_queue_t rq,
4002 thread_t thread)
4003 {
4004 circle_queue_t queue = &rq->queues[thread->sched_pri];
4005
4006 assert(thread->runq != PROCESSOR_NULL);
4007 assert_thread_magic(thread);
4008
4009 circle_dequeue(queue, &thread->runq_links);
4010 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4011 rq->count--;
4012 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4013 rq->urgency--; assert(rq->urgency >= 0);
4014 }
4015
4016 if (circle_queue_empty(queue)) {
4017 /* update run queue status */
4018 bitmap_clear(rq->bitmap, thread->sched_pri);
4019 rq->highq = bitmap_first(rq->bitmap, NRQS);
4020 }
4021
4022 thread->runq = PROCESSOR_NULL;
4023 }
4024
4025 /*
4026 * run_queue_peek
4027 *
4028 * Peek at the runq and return the highest
4029 * priority thread from the runq.
4030 *
4031 * The run queue must be locked.
4032 */
4033 thread_t
run_queue_peek(run_queue_t rq)4034 run_queue_peek(
4035 run_queue_t rq)
4036 {
4037 if (rq->count > 0) {
4038 circle_queue_t queue = &rq->queues[rq->highq];
4039 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4040 assert_thread_magic(thread);
4041 return thread;
4042 } else {
4043 return THREAD_NULL;
4044 }
4045 }
4046
4047 static bool
rt_runq_enqueue(rt_queue_t rt_run_queue,thread_t thread,processor_t processor)4048 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
4049 {
4050 int pri = thread->sched_pri;
4051 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4052 int i = pri - BASEPRI_RTQUEUES;
4053 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4054 bitmap_t *map = rt_run_queue->bitmap;
4055
4056 bitmap_set(map, i);
4057
4058 queue_t queue = &rt_runq->pri_queue;
4059 uint64_t deadline = thread->realtime.deadline;
4060 bool preempt = false;
4061 bool earliest = false;
4062
4063 if (queue_empty(queue)) {
4064 enqueue_tail(queue, &thread->runq_links);
4065 preempt = true;
4066 earliest = true;
4067 rt_runq->pri_earliest_deadline = deadline;
4068 rt_runq->pri_constraint = thread->realtime.constraint;
4069 } else {
4070 /* Insert into rt_runq in thread deadline order */
4071 queue_entry_t iter;
4072 qe_foreach(iter, queue) {
4073 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
4074 assert_thread_magic(iter_thread);
4075
4076 if (deadline < iter_thread->realtime.deadline) {
4077 if (iter == queue_first(queue)) {
4078 preempt = true;
4079 earliest = true;
4080 rt_runq->pri_earliest_deadline = deadline;
4081 rt_runq->pri_constraint = thread->realtime.constraint;
4082 }
4083 insque(&thread->runq_links, queue_prev(iter));
4084 break;
4085 } else if (iter == queue_last(queue)) {
4086 enqueue_tail(queue, &thread->runq_links);
4087 break;
4088 }
4089 }
4090 }
4091 if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
4092 os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
4093 os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
4094 os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
4095 }
4096
4097 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4098 rt_runq->pri_count++;
4099 os_atomic_inc(&rt_run_queue->count, relaxed);
4100
4101 thread->runq = processor;
4102
4103 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4104
4105 return preempt;
4106 }
4107
4108 static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)4109 rt_runq_dequeue(rt_queue_t rt_run_queue)
4110 {
4111 bitmap_t *map = rt_run_queue->bitmap;
4112 int i = bitmap_first(map, NRTQS);
4113 assert((i >= 0) && (i < NRTQS));
4114
4115 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4116
4117 if (!sched_rt_runq_strict_priority) {
4118 int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
4119 if (ed_index != i) {
4120 assert((ed_index >= 0) && (ed_index < NRTQS));
4121 rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
4122
4123 thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
4124 thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4125
4126 if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
4127 /* choose the earliest deadline thread */
4128 rt_runq = ed_runq;
4129 i = ed_index;
4130 }
4131 }
4132 }
4133
4134 assert(rt_runq->pri_count > 0);
4135 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4136 uint32_t constraint = RT_CONSTRAINT_NONE;
4137 int ed_index = NOPRI;
4138 thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
4139 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4140 if (--rt_runq->pri_count > 0) {
4141 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4142 assert(next_rt != THREAD_NULL);
4143 earliest_deadline = next_rt->realtime.deadline;
4144 constraint = next_rt->realtime.constraint;
4145 ed_index = i;
4146 } else {
4147 bitmap_clear(map, i);
4148 }
4149 rt_runq->pri_earliest_deadline = earliest_deadline;
4150 rt_runq->pri_constraint = constraint;
4151
4152 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4153 rt_runq = &rt_run_queue->rt_queue_pri[i];
4154 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4155 earliest_deadline = rt_runq->pri_earliest_deadline;
4156 constraint = rt_runq->pri_constraint;
4157 ed_index = i;
4158 }
4159 }
4160 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4161 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4162 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4163 os_atomic_dec(&rt_run_queue->count, relaxed);
4164
4165 new_thread->runq = PROCESSOR_NULL;
4166
4167 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4168
4169 return new_thread;
4170 }
4171
4172 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4173 rt_runq_first(rt_queue_t rt_run_queue)
4174 {
4175 bitmap_t *map = rt_run_queue->bitmap;
4176 int i = bitmap_first(map, NRTQS);
4177 if (i < 0) {
4178 return THREAD_NULL;
4179 }
4180 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4181 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4182
4183 return next_rt;
4184 }
4185
4186 static void
rt_runq_remove(rt_queue_t rt_run_queue,thread_t thread)4187 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
4188 {
4189 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4190
4191 int pri = thread->sched_pri;
4192 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4193 int i = pri - BASEPRI_RTQUEUES;
4194 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4195 bitmap_t *map = rt_run_queue->bitmap;
4196
4197 assert(rt_runq->pri_count > 0);
4198 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4199 uint32_t constraint = RT_CONSTRAINT_NONE;
4200 int ed_index = NOPRI;
4201 remqueue(&thread->runq_links);
4202 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4203 if (--rt_runq->pri_count > 0) {
4204 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4205 earliest_deadline = next_rt->realtime.deadline;
4206 constraint = next_rt->realtime.constraint;
4207 ed_index = i;
4208 } else {
4209 bitmap_clear(map, i);
4210 }
4211 rt_runq->pri_earliest_deadline = earliest_deadline;
4212 rt_runq->pri_constraint = constraint;
4213
4214 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4215 rt_runq = &rt_run_queue->rt_queue_pri[i];
4216 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4217 earliest_deadline = rt_runq->pri_earliest_deadline;
4218 constraint = rt_runq->pri_constraint;
4219 ed_index = i;
4220 }
4221 }
4222 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4223 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4224 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4225 os_atomic_dec(&rt_run_queue->count, relaxed);
4226
4227 thread->runq = PROCESSOR_NULL;
4228
4229 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4230 }
4231
4232 rt_queue_t
sched_rtlocal_runq(processor_set_t pset)4233 sched_rtlocal_runq(processor_set_t pset)
4234 {
4235 return &pset->rt_runq;
4236 }
4237
4238 void
sched_rtlocal_init(processor_set_t pset)4239 sched_rtlocal_init(processor_set_t pset)
4240 {
4241 pset_rt_init(pset);
4242 }
4243
4244 void
sched_rtlocal_queue_shutdown(processor_t processor)4245 sched_rtlocal_queue_shutdown(processor_t processor)
4246 {
4247 processor_set_t pset = processor->processor_set;
4248 thread_t thread;
4249 queue_head_t tqueue;
4250
4251 pset_lock(pset);
4252
4253 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4254 if (bit_count(pset_available_cpumap(pset)) > 0) {
4255 pset_unlock(pset);
4256 return;
4257 }
4258
4259 queue_init(&tqueue);
4260
4261 while (rt_runq_count(pset) > 0) {
4262 thread = rt_runq_dequeue(&pset->rt_runq);
4263 enqueue_tail(&tqueue, &thread->runq_links);
4264 }
4265 sched_update_pset_load_average(pset, 0);
4266 pset_update_rt_stealable_state(pset);
4267 pset_unlock(pset);
4268
4269 qe_foreach_element_safe(thread, &tqueue, runq_links) {
4270 remqueue(&thread->runq_links);
4271
4272 thread_lock(thread);
4273
4274 thread_setrun(thread, SCHED_TAILQ);
4275
4276 thread_unlock(thread);
4277 }
4278 }
4279
4280 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
4281 void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)4282 sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
4283 {
4284 thread_t thread;
4285
4286 pset_node_t node = &pset_node0;
4287 processor_set_t pset = node->psets;
4288
4289 spl_t s = splsched();
4290 do {
4291 while (pset != NULL) {
4292 pset_lock(pset);
4293
4294 bitmap_t *map = pset->rt_runq.bitmap;
4295 for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4296 rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
4297
4298 qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
4299 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
4300 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
4301 }
4302 }
4303 }
4304
4305 pset_unlock(pset);
4306
4307 pset = pset->pset_list;
4308 }
4309 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4310 splx(s);
4311 }
4312
4313 int64_t
sched_rtlocal_runq_count_sum(void)4314 sched_rtlocal_runq_count_sum(void)
4315 {
4316 pset_node_t node = &pset_node0;
4317 processor_set_t pset = node->psets;
4318 int64_t count = 0;
4319
4320 do {
4321 while (pset != NULL) {
4322 count += pset->rt_runq.runq_stats.count_sum;
4323
4324 pset = pset->pset_list;
4325 }
4326 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4327
4328 return count;
4329 }
4330
4331 /*
4332 * Called with stealing_pset locked and
4333 * returns with stealing_pset locked
4334 * but the lock will have been dropped
4335 * if a thread is returned.
4336 */
4337 thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset,uint64_t earliest_deadline)4338 sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4339 {
4340 if (!sched_allow_rt_steal) {
4341 return THREAD_NULL;
4342 }
4343 pset_map_t pset_map = stealing_pset->node->pset_map;
4344
4345 bit_clear(pset_map, stealing_pset->pset_id);
4346
4347 processor_set_t pset = stealing_pset;
4348
4349 processor_set_t target_pset;
4350 uint64_t target_deadline;
4351
4352 retry:
4353 target_pset = NULL;
4354 target_deadline = earliest_deadline - rt_deadline_epsilon;
4355
4356 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
4357 processor_set_t nset = pset_array[pset_id];
4358
4359 if (nset->stealable_rt_threads_earliest_deadline < target_deadline) {
4360 target_deadline = nset->stealable_rt_threads_earliest_deadline;
4361 target_pset = nset;
4362 }
4363 }
4364
4365 if (target_pset != NULL) {
4366 pset = change_locked_pset(pset, target_pset);
4367 if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4368 thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
4369 pset_update_rt_stealable_state(pset);
4370 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4371
4372 pset = change_locked_pset(pset, stealing_pset);
4373 return new_thread;
4374 }
4375 pset = change_locked_pset(pset, stealing_pset);
4376 earliest_deadline = rt_runq_earliest_deadline(pset);
4377 goto retry;
4378 }
4379
4380 pset = change_locked_pset(pset, stealing_pset);
4381 return THREAD_NULL;
4382 }
4383
4384 /*
4385 * pset is locked
4386 */
4387 thread_t
sched_rt_choose_thread(processor_set_t pset)4388 sched_rt_choose_thread(processor_set_t pset)
4389 {
4390 processor_t processor = current_processor();
4391 uint64_t rt_ll_deadline = 0;
4392 if (rt_constraint_ll != 0) {
4393 rt_ll_deadline = rt_constraint_ll + mach_absolute_time();
4394 }
4395
4396 if (rt_runq_earliest_deadline(pset) < rt_ll_deadline) {
4397 thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4398 pset_update_rt_stealable_state(pset);
4399 assert(new_thread != THREAD_NULL);
4400 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4401 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 1);
4402 }
4403 return new_thread;
4404 }
4405
4406 if (SCHED(steal_thread_enabled)(pset)) {
4407 do {
4408 bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4409 if (spill_pending) {
4410 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4411 }
4412 thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4413 if (new_thread != THREAD_NULL) {
4414 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4415 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4416 }
4417 return new_thread;
4418 }
4419 } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4420 }
4421
4422 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4423 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4424 }
4425
4426 if (rt_runq_count(pset) > 0) {
4427 thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4428 assert(new_thread != THREAD_NULL);
4429 pset_update_rt_stealable_state(pset);
4430 return new_thread;
4431 }
4432
4433 return THREAD_NULL;
4434 }
4435
4436 /*
4437 * realtime_queue_insert:
4438 *
4439 * Enqueue a thread for realtime execution.
4440 */
4441 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4442 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4443 {
4444 pset_assert_locked(pset);
4445
4446 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4447 pset_update_rt_stealable_state(pset);
4448
4449 return preempt;
4450 }
4451
/*
 *	realtime_setrun:
 *
 *	Dispatch a thread for realtime execution.
 *
 *	The thread is enqueued on the pset's realtime run queue, then up
 *	to (1 + n_backup) processors are selected to be notified: slot 0
 *	is always chosen_processor, the remaining slots are filled by
 *	choose_next_rt_processor_for_IPI().  The IPIs themselves are sent
 *	only after the pset lock has been released.
 *
 *	Thread must be locked.  Associated pset must
 *	be locked, and is returned unlocked.
 */
static void
realtime_setrun(
	processor_t chosen_processor,
	thread_t thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	/* Tracks whether we still own the pset lock; updated in the backup loop below. */
	bool pset_is_locked = true;

	int n_backup = 0;

	/* Backup processors are only requested for threads with a tight enough constraint. */
	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/*
	 * CPUs with a pending urgent AST in excess of the number of queued
	 * realtime threads are counted as already-signalled backups and
	 * reduce how many new ones we ask for (never below zero).
	 */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	/* Slot 0 is the chosen processor; slots 1..n_backup hold backups. */
	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	/* Number of valid entries in ipi_processor[] / ipi_type[]. */
	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			/*
			 * Decide whether the new thread should preempt what the
			 * chosen processor is running: higher priority always does;
			 * at equal priority it does when its constraint is within
			 * rt_constraint_ll or its deadline (plus epsilon) is earlier
			 * than the processor's current deadline.
			 */
			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				if (thread->realtime.constraint <= rt_constraint_ll) {
					preempt = (AST_PREEMPT | AST_URGENT);
				} else if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* We are the idle target: no IPI needed, just flag ourselves. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Only the urgent-AST bit is recorded here; no IPI type is set for this slot. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					if (processor == current_processor()) {
						/* Target is this (non-idle) CPU: set the AST locally instead of sending an IPI. */
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slot: re-take the pset lock if the previous iteration left it dropped. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			/*
			 * NOTE(review): the negation implies choose_next_rt_processor_for_IPI()
			 * returns true when it has released the pset lock -- confirm against
			 * its definition.
			 */
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				/* No further candidate processor: stop looking for backups. */
				break;
			}
			count++;

			/*
			 * NOTE(review): `backup` is not declared anywhere in this
			 * function; verify it resolves to something at file scope
			 * (or that KDBG discards its arguments in this configuration).
			 */
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if defined(__x86_64__)
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* With the pset lock released, deliver the IPI recorded for each selected processor. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4601
4602
4603 sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset,processor_t dst,thread_t thread,__unused sched_ipi_event_t event)4604 sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4605 thread_t thread, __unused sched_ipi_event_t event)
4606 {
4607 #if defined(CONFIG_SCHED_DEFERRED_AST)
4608 #if CONFIG_THREAD_GROUPS
4609 if (thread) {
4610 struct thread_group *tg = thread_group_get(thread);
4611 if (thread_group_uses_immediate_ipi(tg)) {
4612 return SCHED_IPI_IMMEDIATE;
4613 }
4614 }
4615 #endif /* CONFIG_THREAD_GROUPS */
4616 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4617 return SCHED_IPI_DEFERRED;
4618 }
4619 #else /* CONFIG_SCHED_DEFERRED_AST */
4620 (void) thread;
4621 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4622 #endif /* CONFIG_SCHED_DEFERRED_AST */
4623 return SCHED_IPI_NONE;
4624 }
4625
4626 sched_ipi_type_t
sched_ipi_action(processor_t dst,thread_t thread,sched_ipi_event_t event)4627 sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4628 {
4629 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4630 assert(dst != NULL);
4631
4632 processor_set_t pset = dst->processor_set;
4633 if (current_processor() == dst) {
4634 return SCHED_IPI_NONE;
4635 }
4636
4637 if (bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4638 return SCHED_IPI_NONE;
4639 }
4640
4641 bool dst_idle = (dst->state == PROCESSOR_IDLE);
4642 if (dst_idle) {
4643 pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
4644 }
4645
4646 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4647 switch (ipi_type) {
4648 case SCHED_IPI_NONE:
4649 return SCHED_IPI_NONE;
4650 #if defined(CONFIG_SCHED_DEFERRED_AST)
4651 case SCHED_IPI_DEFERRED:
4652 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4653 break;
4654 #endif /* CONFIG_SCHED_DEFERRED_AST */
4655 default:
4656 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4657 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4658 dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4659 }
4660 bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4661 break;
4662 }
4663 return ipi_type;
4664 }
4665
4666 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4667 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4668 {
4669 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4670 boolean_t deferred_ipi_supported = false;
4671 processor_set_t pset = dst->processor_set;
4672
4673 #if defined(CONFIG_SCHED_DEFERRED_AST)
4674 deferred_ipi_supported = true;
4675 #endif /* CONFIG_SCHED_DEFERRED_AST */
4676
4677 switch (event) {
4678 case SCHED_IPI_EVENT_SPILL:
4679 case SCHED_IPI_EVENT_SMT_REBAL:
4680 case SCHED_IPI_EVENT_REBALANCE:
4681 case SCHED_IPI_EVENT_BOUND_THR:
4682 case SCHED_IPI_EVENT_RT_PREEMPT:
4683 /*
4684 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4685 * scenarios use immediate IPIs always.
4686 */
4687 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4688 break;
4689 case SCHED_IPI_EVENT_PREEMPT:
4690 /* In the preemption case, use immediate IPIs for RT threads */
4691 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4692 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4693 break;
4694 }
4695
4696 /*
4697 * For Non-RT threads preemption,
4698 * If the core is active, use immediate IPIs.
4699 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4700 */
4701 if (deferred_ipi_supported && dst_idle) {
4702 return sched_ipi_deferred_policy(pset, dst, thread, event);
4703 }
4704 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4705 break;
4706 default:
4707 panic("Unrecognized scheduler IPI event type %d", event);
4708 }
4709 assert(ipi_type != SCHED_IPI_NONE);
4710 return ipi_type;
4711 }
4712
4713 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4714 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4715 {
4716 switch (ipi) {
4717 case SCHED_IPI_NONE:
4718 break;
4719 case SCHED_IPI_IDLE:
4720 machine_signal_idle(dst);
4721 break;
4722 case SCHED_IPI_IMMEDIATE:
4723 cause_ast_check(dst);
4724 break;
4725 case SCHED_IPI_DEFERRED:
4726 machine_signal_idle_deferred(dst);
4727 break;
4728 default:
4729 panic("Unrecognized scheduler IPI type: %d", ipi);
4730 }
4731 }
4732
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Report whether `priority` has its bit set in the sched_preempt_pri
 * bitmap, normalized to TRUE / FALSE.
 */
boolean_t
priority_is_urgent(int priority)
{
	if (bitmap_test(sched_preempt_pri, priority)) {
		return TRUE;
	}
	return FALSE;
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4742
4743 /*
4744 * processor_setrun:
4745 *
4746 * Dispatch a thread for execution on a
4747 * processor.
4748 *
4749 * Thread must be locked. Associated pset must
4750 * be locked, and is returned unlocked.
4751 */
4752 static void
processor_setrun(processor_t processor,thread_t thread,integer_t options)4753 processor_setrun(
4754 processor_t processor,
4755 thread_t thread,
4756 integer_t options)
4757 {
4758 processor_set_t pset = processor->processor_set;
4759 pset_assert_locked(pset);
4760 ast_t preempt;
4761 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4762
4763 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4764
4765 thread->chosen_processor = processor;
4766
4767 /*
4768 * Set preemption mode.
4769 */
4770 #if defined(CONFIG_SCHED_DEFERRED_AST)
4771 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4772 #endif
4773 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4774 preempt = (AST_PREEMPT | AST_URGENT);
4775 } else if (processor->current_is_eagerpreempt) {
4776 preempt = (AST_PREEMPT | AST_URGENT);
4777 } else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4778 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4779 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4780 } else {
4781 preempt = AST_NONE;
4782 }
4783 } else {
4784 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4785 }
4786
4787 if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4788 /*
4789 * Having gone to the trouble of forcing this thread off a less preferred core,
4790 * we should force the preferable core to reschedule immediately to give this
4791 * thread a chance to run instead of just sitting on the run queue where
4792 * it may just be stolen back by the idle core we just forced it off.
4793 */
4794 preempt |= AST_PREEMPT;
4795 }
4796
4797 SCHED(processor_enqueue)(processor, thread, options);
4798 sched_update_pset_load_average(pset, 0);
4799
4800 if (preempt != AST_NONE) {
4801 if (processor->state == PROCESSOR_IDLE) {
4802 ipi_action = eExitIdle;
4803 } else if (processor->state == PROCESSOR_DISPATCHING) {
4804 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4805 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4806 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4807 }
4808 } else if ((processor->state == PROCESSOR_RUNNING ||
4809 processor->state == PROCESSOR_SHUTDOWN) &&
4810 (thread->sched_pri >= processor->current_pri)) {
4811 ipi_action = eInterruptRunning;
4812 }
4813 } else {
4814 /*
4815 * New thread is not important enough to preempt what is running, but
4816 * special processor states may need special handling
4817 */
4818 if (processor->state == PROCESSOR_SHUTDOWN &&
4819 thread->sched_pri >= processor->current_pri) {
4820 ipi_action = eInterruptRunning;
4821 } else if (processor->state == PROCESSOR_IDLE) {
4822 ipi_action = eExitIdle;
4823 } else if (processor->state == PROCESSOR_DISPATCHING) {
4824 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4825 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4826 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
4827 }
4828 }
4829 }
4830
4831 if (ipi_action != eDoNothing) {
4832 if (processor == current_processor()) {
4833 if (ipi_action == eExitIdle) {
4834 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4835 }
4836 if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
4837 ast_on(preempt);
4838 }
4839
4840 if ((preempt & AST_URGENT) == AST_URGENT) {
4841 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4842 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4843 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
4844 }
4845 } else {
4846 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4847 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
4848 }
4849 }
4850
4851 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4852 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4853 } else {
4854 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4855 }
4856 } else {
4857 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
4858 ipi_type = sched_ipi_action(processor, thread, event);
4859 }
4860 }
4861 pset_unlock(pset);
4862 sched_ipi_perform(processor, ipi_type);
4863 }
4864
4865 /*
4866 * choose_next_pset:
4867 *
4868 * Return the next sibling pset containing
4869 * available processors.
4870 *
4871 * Returns the original pset if none other is
4872 * suitable.
4873 */
4874 static processor_set_t
choose_next_pset(processor_set_t pset)4875 choose_next_pset(
4876 processor_set_t pset)
4877 {
4878 processor_set_t nset = pset;
4879
4880 do {
4881 nset = next_pset(nset);
4882 } while (nset->online_processor_count < 1 && nset != pset);
4883
4884 return nset;
4885 }
4886
/*
 *	choose_processor:
 *
 *	Choose a processor for the thread, beginning at
 *	the pset.  Accepts an optional processor hint in
 *	the pset.
 *
 *	Returns a processor, possibly from a different pset.
 *	Returns PROCESSOR_NULL, after unlocking the pset that is
 *	held at that point, when no candidate was found or when the
 *	final candidate is no longer in its pset's available cpumap.
 *
 *	Realtime threads (sched_pri >= BASEPRI_RTQUEUES) and all other
 *	threads take two separate search paths below.
 *
 *	The thread must be locked.  The pset must be locked,
 *	and the resulting pset is locked on return.
 */
processor_t
choose_processor(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread)
{
	processor_set_t pset = starting_pset;
	processor_set_t nset;

	assert(thread->sched_pri <= MAXPRI);

	/*
	 * Prefer the hinted processor, when appropriate.
	 */

	/* Fold last processor hint from secondary processor to its primary */
	if (processor != PROCESSOR_NULL) {
		processor = processor->processor_primary;
	}

	/*
	 * Only consult platform layer if pset is active, which
	 * it may not be in some cases when a multi-set system
	 * is going to sleep.
	 */
	if (pset->online_processor_count) {
		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
			processor_t mc_processor = machine_choose_processor(pset, processor);
			if (mc_processor != PROCESSOR_NULL) {
				/* As with the caller's hint, only the primary is kept. */
				processor = mc_processor->processor_primary;
			}
		}
	}

	/*
	 * At this point, we may have a processor hint, and we may have
	 * an initial starting pset. If the hint is not in the pset, or
	 * if the hint is for a processor in an invalid state, discard
	 * the hint.
	 */
	if (processor != PROCESSOR_NULL) {
		if (processor->processor_set != pset) {
			processor = PROCESSOR_NULL;
		} else if (!processor->is_recommended) {
			processor = PROCESSOR_NULL;
		} else {
			switch (processor->state) {
			case PROCESSOR_START:
			case PROCESSOR_SHUTDOWN:
			case PROCESSOR_OFF_LINE:
				/*
				 * Hint is for a processor that cannot support running new threads.
				 */
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_IDLE:
				/*
				 * Hint is for an idle processor. Assume it is no worse than any other
				 * idle processor. The platform layer had an opportunity to provide
				 * the "least cost idle" processor above.
				 *
				 * A realtime thread only takes the idle hint when the hint also
				 * passes the realtime fast-track check; otherwise the hint is dropped.
				 */
				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					return processor;
				}
				processor = PROCESSOR_NULL;
				break;
			case PROCESSOR_RUNNING:
			case PROCESSOR_DISPATCHING:
				/*
				 * Hint is for an active CPU. This fast-path allows
				 * realtime threads to preempt non-realtime threads
				 * to regain their previous executing processor.
				 */
				if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
				    processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
					return processor;
				}

				/* Otherwise, use hint as part of search below */
				break;
			default:
				processor = PROCESSOR_NULL;
				break;
			}
		}
	}

	/*
	 * Iterate through the processor sets to locate
	 * an appropriate processor. Seed results with
	 * a last-processor hint, if available, so that
	 * a search must find something strictly better
	 * to replace it.
	 *
	 * A primary/secondary pair of SMT processors are
	 * "unpaired" if the primary is busy but its
	 * corresponding secondary is idle (so the physical
	 * core has full use of its resources).
	 */

	/*
	 * The priority trackers start at MAXPRI + 1.  Since
	 * thread->sched_pri <= MAXPRI was asserted above, a
	 * "thread->sched_pri > lowest_*" test cannot pass until the
	 * tracker has been lowered, which always happens together
	 * with recording the matching lp_* processor.
	 */
	integer_t lowest_priority = MAXPRI + 1;
	integer_t lowest_secondary_priority = MAXPRI + 1;
	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
	integer_t lowest_count = INT_MAX;
	uint64_t  furthest_deadline = 1;
	processor_t lp_processor = PROCESSOR_NULL;
	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	processor_t lc_processor = PROCESSOR_NULL;
	processor_t fd_processor = PROCESSOR_NULL;

	if (processor != PROCESSOR_NULL) {
		/* All other states should be enumerated above. */
		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);

		lowest_priority = processor->current_pri;
		lp_processor = processor;

		/* The hint only seeds the furthest-deadline candidate if it is at realtime priority. */
		if (processor->current_pri >= BASEPRI_RTQUEUES) {
			furthest_deadline = processor->deadline;
			fd_processor = processor;
		}

		lowest_count = SCHED(processor_runq_count)(processor);
		lc_processor = processor;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		pset_node_t node = pset->node;
		/*
		 * consider_secondaries starts at 0 or 1 and the outer loop runs
		 * until it reaches 2: either two passes over the psets (0, then 1)
		 * or a single pass with secondaries considered from the start.
		 */
		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0);
		for (; consider_secondaries < 2; consider_secondaries++) {
			/* NOTE(review): presumably moves the held lock back to starting_pset - confirm against change_locked_pset(). */
			pset = change_locked_pset(pset, starting_pset);
			do {
				cpumap_t available_map = pset_available_cpumap(pset);
				if (available_map == 0) {
					goto no_available_cpus;
				}

				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
				if (processor) {
					return processor;
				}

				/* Fallback candidates are only gathered on the pass that considers secondaries. */
				if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false);
					if (processor && (processor->deadline > furthest_deadline)) {
						fd_processor = processor;
						furthest_deadline = processor->deadline;
						if (sched_choose_first_fd_processor && ((rt_constraint_ll == 0) || (furthest_deadline > rt_constraint_ll + mach_absolute_time()))) {
							/*
							 * Instead of looping through all the psets to find the global
							 * furthest deadline processor, preempt the first candidate found.
							 * The preempted thread will then find any other available far deadline
							 * processors to preempt.
							 */
							return fd_processor;
						}
					}

					/* Remember the first available cpu of the pset with the shortest realtime run queue. */
					if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
						assert(cpuid >= 0);
						lc_processor = processor_array[cpuid];
						lowest_count = rt_runq_count(pset);
					}
				}

no_available_cpus:
				nset = next_pset(pset);

				if (nset != starting_pset) {
					pset = change_locked_pset(pset, nset);
				}
			} while (nset != starting_pset);
		}

		/* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
			if (fd_processor) {
				pset_assert_locked(fd_processor->processor_set);
				return fd_processor;
			} else if (lc_processor) {
				pset_assert_locked(lc_processor->processor_set);
				return lc_processor;
			}
		}

		/* Prefer the furthest-deadline candidate over the lowest-count one. */
		processor = PROCESSOR_NULL;
		if (fd_processor) {
			processor = fd_processor;
		} else if (lc_processor) {
			processor = lc_processor;
		}

		if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
			/* Check that chosen processor is still usable */
			cpumap_t available_map = pset_available_cpumap(pset);
			if (bit_test(available_map, processor->cpu_id)) {
				return processor;
			}

			/* processor is no longer usable */
			processor = PROCESSOR_NULL;
		}

		/* Nothing usable: drop the lock and let the caller decide what to do. */
		pset_assert_locked(pset);
		pset_unlock(pset);
		return PROCESSOR_NULL;
	}

	/* No realtime threads from this point on */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);

	do {
		/*
		 * Choose an idle processor, in pset traversal order
		 */

		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);

		int cpuid = lsb_first(idle_primary_map);
		if (cpuid >= 0) {
			processor = processor_array[cpuid];
			return processor;
		}

		/*
		 * Otherwise, enumerate active and idle processors to find primary candidates
		 * with lower priority/etc.
		 */

		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
		    pset->recommended_bitmask &
		    ~pset->pending_AST_URGENT_cpu_mask);

		/* A non-urgent thread also skips processors that already have a pending AST_PREEMPT. */
		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
		}

		/*
		 * Rotate the map by (last_chosen + 1) and walk it from its lowest
		 * set bit; rotid indexes the rotated map and is converted back
		 * to a cpu id inside the loop.
		 */
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
			cpuid = ((rotid + pset->last_chosen + 1) & 63);
			processor = processor_array[cpuid];

			integer_t cpri = processor->current_pri;
			processor_t primary = processor->processor_primary;
			if (primary != processor) {
				/* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
					if (cpri < lowest_secondary_priority) {
						lowest_secondary_priority = cpri;
						lp_paired_secondary_processor = processor;
					}
				}
			} else {
				if (cpri < lowest_priority) {
					lowest_priority = cpri;
					lp_processor = processor;
				}
			}

			/* Run queue depth is tracked for primaries and secondaries alike. */
			integer_t ccount = SCHED(processor_runq_count)(processor);
			if (ccount < lowest_count) {
				lowest_count = ccount;
				lc_processor = processor;
			}
		}

		/*
		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
		 * the idle primary would have short-circuited the loop above
		 */
		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
		    ~pset->primary_map &
		    pset->recommended_bitmask);

		/* there shouldn't be a pending AST if the processor is idle */
		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);

		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
			processor = processor_array[cpuid];

			processor_t cprimary = processor->processor_primary;

			integer_t primary_pri = cprimary->current_pri;

			/*
			 * TODO: This should also make the same decisions
			 * as secondary_can_run_realtime_thread
			 *
			 * TODO: Keep track of the pending preemption priority
			 * of the primary to make this more accurate.
			 */

			/* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
				continue;
			}

			/*
			 * Find the idle secondary processor with the lowest priority primary
			 *
			 * We will choose this processor as a fallback if we find no better
			 * primary to preempt.
			 */
			if (primary_pri < lowest_idle_secondary_priority) {
				lp_idle_secondary_processor = processor;
				lowest_idle_secondary_priority = primary_pri;
			}

			/* Find the lowest priority active primary with idle secondary */
			if (primary_pri < lowest_unpaired_primary_priority) {
				/* If the primary processor is offline or starting up, it's not a candidate for this path */
				if (cprimary->state != PROCESSOR_RUNNING &&
				    cprimary->state != PROCESSOR_DISPATCHING) {
					continue;
				}

				if (!cprimary->is_recommended) {
					continue;
				}

				/* if the primary is pending preemption, don't try to re-preempt it */
				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
					continue;
				}

				lowest_unpaired_primary_priority = primary_pri;
				lp_unpaired_primary_processor = cprimary;
			}
		}

		/*
		 * We prefer preempting a primary processor over waking up its secondary.
		 * The secondary will then be woken up by the preempted thread.
		 *
		 * The test can only pass once a candidate has been recorded
		 * (see the MAXPRI + 1 note above), so the pointer is non-NULL here.
		 */
		if (thread->sched_pri > lowest_unpaired_primary_priority) {
			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
			return lp_unpaired_primary_processor;
		}

		/*
		 * We prefer preempting a lower priority active processor over directly
		 * waking up an idle secondary.
		 * The preempted thread will then find the idle secondary.
		 */
		if (thread->sched_pri > lowest_priority) {
			pset->last_chosen = lp_processor->cpu_id;
			return lp_processor;
		}

		/*
		 * lc_processor is used to indicate the best processor set run queue
		 * on which to enqueue a thread when all available CPUs are busy with
		 * higher priority threads, so try to make sure it is initialized.
		 */
		if (lc_processor == PROCESSOR_NULL) {
			cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
			if (cpuid >= 0) {
				lc_processor = processor_array[cpuid];
				lowest_count = SCHED(processor_runq_count)(lc_processor);
			}
		}

		/*
		 * Move onto the next processor set.
		 *
		 * If all primary processors in this pset are running a higher
		 * priority thread, move on to next pset. Only when we have
		 * exhausted the search for primary processors do we
		 * fall back to secondaries.
		 */
#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
		 */
		nset = starting_pset;
#else /* CONFIG_SCHED_EDGE */
		nset = next_pset(pset);
#endif /* CONFIG_SCHED_EDGE */

		if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
		}
	} while (nset != starting_pset);

	/*
	 * Make sure that we pick a running processor,
	 * and that the correct processor set is locked.
	 * Since we may have unlocked the candidate processor's
	 * pset, it may have changed state.
	 *
	 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the least priority
	 * primary, or the least busy primary.
	 */

	/*
	 * lowest_priority is evaluated in the main loops above.
	 * Fallback order: idle secondary, then paired (active) secondary,
	 * then the processor with the shortest run queue.
	 */
	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
		processor = lp_idle_secondary_processor;
	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
		processor = lp_paired_secondary_processor;
	} else if (lc_processor != PROCESSOR_NULL) {
		processor = lc_processor;
	} else {
		processor = PROCESSOR_NULL;
	}

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			pset->last_chosen = processor->cpu_id;
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	/* Nothing usable: drop the lock and let the caller decide what to do. */
	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
5344
5345 /*
5346 * Default implementation of SCHED(choose_node)()
5347 * for single node systems
5348 */
5349 pset_node_t
sched_choose_node(__unused thread_t thread)5350 sched_choose_node(__unused thread_t thread)
5351 {
5352 return &pset_node0;
5353 }
5354
5355 /*
5356 * choose_starting_pset:
5357 *
5358 * Choose a starting processor set for the thread.
5359 * May return a processor hint within the pset.
5360 *
5361 * Returns a starting processor set, to be used by
5362 * choose_processor.
5363 *
5364 * The thread must be locked. The resulting pset is unlocked on return,
5365 * and is chosen without taking any pset locks.
5366 */
5367 processor_set_t
choose_starting_pset(pset_node_t node,thread_t thread,processor_t * processor_hint)5368 choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5369 {
5370 processor_set_t pset;
5371 processor_t processor = PROCESSOR_NULL;
5372
5373 if (thread->affinity_set != AFFINITY_SET_NULL) {
5374 /*
5375 * Use affinity set policy hint.
5376 */
5377 pset = thread->affinity_set->aset_pset;
5378 } else if (thread->last_processor != PROCESSOR_NULL) {
5379 /*
5380 * Simple (last processor) affinity case.
5381 */
5382 processor = thread->last_processor;
5383 pset = processor->processor_set;
5384 } else {
5385 /*
5386 * No Affinity case:
5387 *
5388 * Utilitize a per task hint to spread threads
5389 * among the available processor sets.
5390 * NRG this seems like the wrong thing to do.
5391 * See also task->pset_hint = pset in thread_setrun()
5392 */
5393 pset = get_threadtask(thread)->pset_hint;
5394 if (pset == PROCESSOR_SET_NULL) {
5395 pset = current_processor()->processor_set;
5396 }
5397
5398 pset = choose_next_pset(pset);
5399 }
5400
5401 if (!bit_test(node->pset_map, pset->pset_id)) {
5402 /* pset is not from this node so choose one that is */
5403 int id = lsb_first(node->pset_map);
5404 if (id < 0) {
5405 /* startup race, so check again under the node lock */
5406 lck_spin_lock(&pset_node_lock);
5407 if (bit_test(node->pset_map, pset->pset_id)) {
5408 id = pset->pset_id;
5409 } else {
5410 id = lsb_first(node->pset_map);
5411 }
5412 lck_spin_unlock(&pset_node_lock);
5413 }
5414 assert(id >= 0);
5415 pset = pset_array[id];
5416 }
5417
5418 if (bit_count(node->pset_map) == 1) {
5419 /* Only a single pset in this node */
5420 goto out;
5421 }
5422
5423 bool avoid_cpu0 = false;
5424
5425 #if defined(__x86_64__)
5426 if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5427 /* Avoid the pset containing cpu0 */
5428 avoid_cpu0 = true;
5429 /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
5430 assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5431 }
5432 #endif
5433
5434 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5435 pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5436 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5437 if (avoid_cpu0) {
5438 rt_target_map = bit_ror64(rt_target_map, 1);
5439 }
5440 int rotid = lsb_first(rt_target_map);
5441 if (rotid >= 0) {
5442 int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5443 pset = pset_array[id];
5444 goto out;
5445 }
5446 }
5447 if (!pset->is_SMT || !sched_allow_rt_smt) {
5448 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5449 goto out;
5450 }
5451 rt_target_map = atomic_load(&node->pset_non_rt_map);
5452 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5453 if (avoid_cpu0) {
5454 rt_target_map = bit_ror64(rt_target_map, 1);
5455 }
5456 int rotid = lsb_first(rt_target_map);
5457 if (rotid >= 0) {
5458 int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5459 pset = pset_array[id];
5460 goto out;
5461 }
5462 }
5463 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5464 } else {
5465 pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5466 if (!bit_test(idle_map, pset->pset_id)) {
5467 int next_idle_pset_id = lsb_first(idle_map);
5468 if (next_idle_pset_id >= 0) {
5469 pset = pset_array[next_idle_pset_id];
5470 }
5471 }
5472 }
5473
5474 out:
5475 if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5476 processor = PROCESSOR_NULL;
5477 }
5478 if (processor != PROCESSOR_NULL) {
5479 *processor_hint = processor;
5480 }
5481
5482 assert(pset != NULL);
5483 return pset;
5484 }
5485
5486 /*
5487 * thread_setrun:
5488 *
5489 * Dispatch thread for execution, onto an idle
5490 * processor or run queue, and signal a preemption
5491 * as appropriate.
5492 *
5493 * Thread must be locked.
5494 */
5495 void
thread_setrun(thread_t thread,sched_options_t options)5496 thread_setrun(
5497 thread_t thread,
5498 sched_options_t options)
5499 {
5500 processor_t processor = PROCESSOR_NULL;
5501 processor_set_t pset;
5502
5503 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5504 assert(thread->runq == PROCESSOR_NULL);
5505
5506 #if CONFIG_PREADOPT_TG
5507 /* We know that the thread is not in the runq by virtue of being in this
5508 * function and the thread is not self since we are running. We can safely
5509 * resolve the thread group hierarchy and modify the thread's thread group
5510 * here. */
5511 thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5512 #endif
5513
5514 /*
5515 * Update priority if needed.
5516 */
5517 if (SCHED(can_update_priority)(thread)) {
5518 SCHED(update_priority)(thread);
5519 }
5520 thread->sfi_class = sfi_thread_classify(thread);
5521
5522 if (thread->bound_processor == PROCESSOR_NULL) {
5523 /*
5524 * Unbound case.
5525 *
5526 * Usually, this loop will only be executed once,
5527 * but if CLPC derecommends a processor after it has been chosen,
5528 * or if a processor is shut down after it is chosen,
5529 * choose_processor() may return NULL, so a retry
5530 * may be necessary. A single retry will usually
5531 * be enough, and we can't afford to retry too many times
5532 * because interrupts are disabled.
5533 */
5534 #define CHOOSE_PROCESSOR_MAX_RETRIES 3
5535 for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5536 processor_t processor_hint = PROCESSOR_NULL;
5537 pset_node_t node = SCHED(choose_node)(thread);
5538 processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5539
5540 pset_lock(starting_pset);
5541
5542 processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5543 if (processor != PROCESSOR_NULL) {
5544 pset = processor->processor_set;
5545 pset_assert_locked(pset);
5546 break;
5547 }
5548 }
5549 /*
5550 * If choose_processor() still returns NULL,
5551 * which is very unlikely,
5552 * choose the master_processor, which is always
5553 * safe to choose.
5554 */
5555 if (processor == PROCESSOR_NULL) {
5556 /* Choose fallback processor */
5557 processor = master_processor;
5558 pset = processor->processor_set;
5559 pset_lock(pset);
5560 }
5561 task_t task = get_threadtask(thread);
5562 if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5563 task->pset_hint = pset; /* NRG this is done without holding the task lock */
5564 }
5565 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5566 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5567 } else {
5568 /*
5569 * Bound case:
5570 *
5571 * Unconditionally dispatch on the processor.
5572 */
5573 processor = thread->bound_processor;
5574 pset = processor->processor_set;
5575 pset_lock(pset);
5576
5577 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5578 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5579 }
5580
5581 /*
5582 * Dispatch the thread on the chosen processor.
5583 * TODO: This should be based on sched_mode, not sched_pri
5584 */
5585 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5586 realtime_setrun(processor, thread);
5587 } else {
5588 processor_setrun(processor, thread, options);
5589 }
5590 /* pset is now unlocked */
5591 if (thread->bound_processor == PROCESSOR_NULL) {
5592 SCHED(check_spill)(pset, thread);
5593 }
5594 }
5595
5596 processor_set_t
task_choose_pset(task_t task)5597 task_choose_pset(
5598 task_t task)
5599 {
5600 processor_set_t pset = task->pset_hint;
5601
5602 if (pset != PROCESSOR_SET_NULL) {
5603 pset = choose_next_pset(pset);
5604 }
5605
5606 return pset;
5607 }
5608
5609 /*
5610 * Check for a preemption point in
5611 * the current context.
5612 *
5613 * Called at splsched with thread locked.
5614 */
5615 ast_t
csw_check(thread_t thread,processor_t processor,ast_t check_reason)5616 csw_check(
5617 thread_t thread,
5618 processor_t processor,
5619 ast_t check_reason)
5620 {
5621 processor_set_t pset = processor->processor_set;
5622
5623 assert(thread == processor->active_thread);
5624
5625 pset_lock(pset);
5626
5627 processor_state_update_from_thread(processor, thread, true);
5628
5629 ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5630
5631 /* Acknowledge the IPI if we decided not to preempt */
5632
5633 if ((preempt & AST_URGENT) == 0) {
5634 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5635 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5636 }
5637 }
5638
5639 if ((preempt & AST_PREEMPT) == 0) {
5640 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5641 }
5642
5643 pset_unlock(pset);
5644
5645 return preempt;
5646 }
5647
5648 /*
5649 * Check for preemption at splsched with
5650 * pset and thread locked
5651 */
5652 ast_t
csw_check_locked(thread_t thread,processor_t processor,processor_set_t pset,ast_t check_reason)5653 csw_check_locked(
5654 thread_t thread,
5655 processor_t processor,
5656 processor_set_t pset,
5657 ast_t check_reason)
5658 {
5659 /*
5660 * If the current thread is running on a processor that is no longer recommended,
5661 * urgently preempt it, at which point thread_select() should
5662 * try to idle the processor and re-dispatch the thread to a recommended processor.
5663 */
5664 if (!processor->is_recommended) {
5665 return check_reason | AST_PREEMPT | AST_URGENT;
5666 }
5667
5668 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5669 return check_reason | AST_PREEMPT | AST_URGENT;
5670 }
5671
5672 if (rt_runq_count(pset) > 0) {
5673 if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5674 return check_reason | AST_PREEMPT | AST_URGENT;
5675 } else {
5676 return check_reason | AST_PREEMPT;
5677 }
5678 }
5679
5680 ast_t result = SCHED(processor_csw_check)(processor);
5681 if (result != AST_NONE) {
5682 return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
5683 }
5684
5685 /*
5686 * Same for avoid-processor
5687 *
5688 * TODO: Should these set AST_REBALANCE?
5689 */
5690 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
5691 return check_reason | AST_PREEMPT;
5692 }
5693
5694 /*
5695 * Even though we could continue executing on this processor, a
5696 * secondary SMT core should try to shed load to another primary core.
5697 *
5698 * TODO: Should this do the same check that thread_select does? i.e.
5699 * if no bound threads target this processor, and idle primaries exist, preempt
5700 * The case of RT threads existing is already taken care of above
5701 */
5702
5703 if (processor->current_pri < BASEPRI_RTQUEUES &&
5704 processor->processor_primary != processor) {
5705 return check_reason | AST_PREEMPT;
5706 }
5707
5708 if (thread->state & TH_SUSP) {
5709 return check_reason | AST_PREEMPT;
5710 }
5711
5712 #if CONFIG_SCHED_SFI
5713 /*
5714 * Current thread may not need to be preempted, but maybe needs
5715 * an SFI wait?
5716 */
5717 result = sfi_thread_needs_ast(thread, NULL);
5718 if (result != AST_NONE) {
5719 return check_reason | result;
5720 }
5721 #endif
5722
5723 return AST_NONE;
5724 }
5725
5726 static void
ast_ack_if_needed(processor_t processor)5727 ast_ack_if_needed(processor_t processor)
5728 {
5729 struct ast_gen_pair *pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, processor);
5730 ast_gen_t gen;
5731
5732 /*
5733 * Make sure that if we observe a new generation, we ack it.
5734 *
5735 * Note that this ack might ack for a cause_ast_check()
5736 * that hasn't happened yet: 2 different cores A and B could
5737 * have called ast_generation_get(), we observe B's generation
5738 * already, before B has had a chance to call cause_ast_check() yet.
5739 *
5740 * This still preserves the property that we want,
5741 * which is that `processor` has been in ast_check()
5742 * _after_ ast_generation_get() was called.
5743 */
5744
5745 gen = os_atomic_load(&pair->ast_gen, relaxed);
5746 if (gen != os_atomic_load(&pair->ast_ack, relaxed)) {
5747 /* pairs with the fence in ast_generation_get() */
5748 os_atomic_thread_fence(acq_rel);
5749 os_atomic_store(&pair->ast_ack, gen, relaxed);
5750 }
5751 }
5752
/*
 *	ast_check:
 *
 *	Handle preemption IPI or IPI in response to setting an AST flag
 *	Triggered by cause_ast_check
 *	Called at splsched
 *
 *	If `processor` is neither running nor shutting down, only the AST
 *	generation is acknowledged.  Otherwise, with the active thread
 *	locked, the generation is acknowledged, the thread's ASTs are
 *	propagated, csw_check() is consulted for a preemption AST, and
 *	urgency / perfcontrol-class changes observed across csw_check()
 *	are reported.
 */
void
ast_check(processor_t processor)
{
	/* Nothing to re-evaluate unless the processor is running or shutting down. */
	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		ast_ack_if_needed(processor);
		return;
	}

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	thread_lock(thread);

	/* Acknowledged while the thread lock is held. */
	ast_ack_if_needed(processor);
	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	/* Post whatever preemption AST csw_check() asks for. */
	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	/* The perfcontrol notification is issued after the thread lock is dropped. */
	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}
}
5817
/*
 *	ast_generation_get:
 *
 *	Advance the AST generation counter of `processor` by 2 (release
 *	ordering) and record the value returned by that increment in
 *	gens[processor->cpu_id].
 *
 *	The recorded value is what ast_generation_wait() later compares
 *	against the processor's ast_ack.
 */
void
ast_generation_get(processor_t processor, ast_gen_t gens[])
{
	struct ast_gen_pair *pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, processor);

	gens[processor->cpu_id] = os_atomic_add(&pair->ast_gen, 2, release);
}
5825
/*
 *	ast_generation_wait:
 *
 *	For every CPU with a non-zero entry in `gens`, wait until that
 *	CPU's ast_ack is no longer behind the recorded generation
 *	(as ordered by AST_GEN_CMP).
 *
 *	gens is indexed by cpu_id.  Entries equal to 0 are skipped --
 *	presumably meaning no generation was requested for that CPU
 *	(NOTE(review): confirm against callers).
 */
void
ast_generation_wait(ast_gen_t gens[MAX_CPUS])
{
	percpu_foreach(cpup, processor) {
		struct ast_gen_pair *pair;
		ast_gen_t gen_ack;
		uint32_t cpu = cpup->cpu_id;

		if (gens[cpu] == 0) {
			continue;
		}
		pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, cpup);
		gen_ack = os_atomic_load(&pair->ast_ack, relaxed);
		while (__improbable(AST_GEN_CMP(gen_ack, <, gens[cpu]))) {
			/*
			 * Each wait is bracketed by disable/enable preemption.
			 * hw_wait_while_equals() presumably returns once ast_ack
			 * no longer equals gen_ack, yielding the new value --
			 * TODO confirm.
			 */
			disable_preemption();
			gen_ack = (unsigned long)hw_wait_while_equals((void **)(uintptr_t)&pair->ast_ack, (void *)gen_ack);
			enable_preemption();
		}
	}
}
5846
5847
/*
 *	set_sched_pri:
 *
 *	Set the scheduled priority of the specified thread.
 *
 *	This may cause the thread to change queues.
 *
 *	Thread must be locked.
 *
 *	thread        thread whose sched_pri is updated
 *	new_priority  value stored into thread->sched_pri
 *	options       if SETPRI_LAZY is set, the current thread is not run
 *	              through csw_check() and a thread active on another
 *	              processor is not sent cause_ast_check()
 */
void
set_sched_pri(
	thread_t        thread,
	int16_t         new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	/*
	 * The current thread is asserted to be off any run queue; any other
	 * thread is pulled off its run queue (if it is on one) before the
	 * priority changes.
	 */
	if (is_current_thread) {
		assert(thread->state & TH_RUN);
		assert(thread->runq == PROCESSOR_NULL);
	} else {
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Was queued: put it back so it is queued at the new priority. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/*
		 * Runnable, not on a run queue, and not the current thread:
		 * if it is the active thread of some other processor, ask that
		 * processor to re-run its AST check (unless the update is lazy).
		 */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
5968
5969 /*
5970 * thread_run_queue_remove_for_handoff
5971 *
5972 * Pull a thread or its (recursive) push target out of the runqueue
5973 * so that it is ready for thread_run()
5974 *
5975 * Called at splsched
5976 *
5977 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
5978 * This may be different than the thread that was passed in.
5979 */
5980 thread_t
thread_run_queue_remove_for_handoff(thread_t thread)5981 thread_run_queue_remove_for_handoff(thread_t thread)
5982 {
5983 thread_t pulled_thread = THREAD_NULL;
5984
5985 thread_lock(thread);
5986
5987 /*
5988 * Check that the thread is not bound to a different processor,
5989 * NO_SMT flag is not set on the thread, cluster type of
5990 * processor matches with thread if the thread is pinned to a
5991 * particular cluster and that realtime is not involved.
5992 *
5993 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
5994 */
5995 processor_t processor = current_processor();
5996 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
5997 && (!thread_no_smt(thread))
5998 && (processor->current_pri < BASEPRI_RTQUEUES)
5999 && (thread->sched_pri < BASEPRI_RTQUEUES)
6000 #if __AMP__
6001 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6002 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6003 #endif /* __AMP__ */
6004 ) {
6005 if (thread_run_queue_remove(thread)) {
6006 pulled_thread = thread;
6007 }
6008 }
6009
6010 thread_unlock(thread);
6011
6012 return pulled_thread;
6013 }
6014
6015 /*
6016 * thread_prepare_for_handoff
6017 *
6018 * Make the thread ready for handoff.
6019 * If the thread was runnable then pull it off the runq, if the thread could
6020 * not be pulled, return NULL.
6021 *
6022 * If the thread was woken up from wait for handoff, make sure it is not bound to
6023 * different processor.
6024 *
6025 * Called at splsched
6026 *
6027 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6028 * This may be different than the thread that was passed in.
6029 */
6030 thread_t
thread_prepare_for_handoff(thread_t thread,thread_handoff_option_t option)6031 thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
6032 {
6033 thread_t pulled_thread = THREAD_NULL;
6034
6035 if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
6036 processor_t processor = current_processor();
6037 thread_lock(thread);
6038
6039 /*
6040 * Check that the thread is not bound to a different processor,
6041 * NO_SMT flag is not set on the thread and cluster type of
6042 * processor matches with thread if the thread is pinned to a
6043 * particular cluster. Call setrun instead if above conditions
6044 * are not satisfied.
6045 */
6046 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6047 && (!thread_no_smt(thread))
6048 #if __AMP__
6049 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6050 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6051 #endif /* __AMP__ */
6052 ) {
6053 pulled_thread = thread;
6054 } else {
6055 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
6056 }
6057 thread_unlock(thread);
6058 } else {
6059 pulled_thread = thread_run_queue_remove_for_handoff(thread);
6060 }
6061
6062 return pulled_thread;
6063 }
6064
/*
 *	thread_run_queue_remove:
 *
 *	Remove a thread from its current run queue and
 *	return TRUE if successful.
 *
 *	Thread must be locked.
 *
 *	If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
 *	run queues because the caller locked the thread.  Otherwise
 *	the thread is on a run queue, but could be chosen for dispatch
 *	and removed by another processor under a different lock, which
 *	will set thread->runq to PROCESSOR_NULL.
 *
 *	Hence the thread select path must not rely on anything that could
 *	be changed under the thread lock after calling this function,
 *	most importantly thread->sched_pri.
 *
 *	Returns FALSE when the thread is waiting and not runnable, when it
 *	is not on a run queue, or when it left the realtime run queue
 *	before the pset lock was taken.
 */
boolean_t
thread_run_queue_remove(
	thread_t        thread)
{
	boolean_t removed = FALSE;
	processor_t processor = thread->runq;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		assert(thread->runq == PROCESSOR_NULL);
		return FALSE;
	}

	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	/* Below the realtime band the scheduler-specific queue removal is used. */
	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		return SCHED(processor_queue_remove)(processor, thread);
	}

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/* Re-read runq now that the pset lock is held. */
	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6133
/*
 *	thread_run_queue_reinsert:
 *
 *	Put the thread back where it goes after a thread_run_queue_remove
 *
 *	Thread must have been removed under the same thread lock hold
 *
 *	thread locked, at splsched
 *
 *	thread   runnable thread that is currently on no run queue
 *	         (both conditions are asserted)
 *	options  passed through unchanged to thread_setrun()
 */
void
thread_run_queue_reinsert(thread_t thread, sched_options_t options)
{
	assert(thread->runq == PROCESSOR_NULL);
	assert(thread->state & (TH_RUN));

	thread_setrun(thread, options);
}
6149
6150 void
sys_override_cpu_throttle(boolean_t enable_override)6151 sys_override_cpu_throttle(boolean_t enable_override)
6152 {
6153 if (enable_override) {
6154 cpu_throttle_enabled = 0;
6155 } else {
6156 cpu_throttle_enabled = 1;
6157 }
6158 }
6159
6160 thread_urgency_t
thread_get_urgency(thread_t thread,uint64_t * arg1,uint64_t * arg2)6161 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
6162 {
6163 uint64_t urgency_param1 = 0, urgency_param2 = 0;
6164 task_t task = get_threadtask_early(thread);
6165
6166 thread_urgency_t urgency;
6167
6168 if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
6169 urgency_param1 = 0;
6170 urgency_param2 = 0;
6171
6172 urgency = THREAD_URGENCY_NONE;
6173 } else if (thread->sched_mode == TH_MODE_REALTIME) {
6174 urgency_param1 = thread->realtime.period;
6175 urgency_param2 = thread->realtime.deadline;
6176
6177 urgency = THREAD_URGENCY_REAL_TIME;
6178 } else if (cpu_throttle_enabled &&
6179 (thread->sched_pri <= MAXPRI_THROTTLE) &&
6180 (thread->base_pri <= MAXPRI_THROTTLE)) {
6181 /*
6182 * Threads that are running at low priority but are not
6183 * tagged with a specific QoS are separated out from
6184 * the "background" urgency. Performance management
6185 * subsystem can decide to either treat these threads
6186 * as normal threads or look at other signals like thermal
6187 * levels for optimal power/perf tradeoffs for a platform.
6188 */
6189 boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
6190 boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);
6191
6192 /*
6193 * Background urgency applied when thread priority is
6194 * MAXPRI_THROTTLE or lower and thread is not promoted
6195 * and thread has a QoS specified
6196 */
6197 urgency_param1 = thread->sched_pri;
6198 urgency_param2 = thread->base_pri;
6199
6200 if (thread_lacks_qos && !task_is_suppressed) {
6201 urgency = THREAD_URGENCY_LOWPRI;
6202 } else {
6203 urgency = THREAD_URGENCY_BACKGROUND;
6204 }
6205 } else {
6206 /* For otherwise unclassified threads, report throughput QoS parameters */
6207 urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
6208 urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
6209 urgency = THREAD_URGENCY_NORMAL;
6210 }
6211
6212 if (arg1 != NULL) {
6213 *arg1 = urgency_param1;
6214 }
6215 if (arg2 != NULL) {
6216 *arg2 = urgency_param2;
6217 }
6218
6219 return urgency;
6220 }
6221
6222 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6223 thread_get_perfcontrol_class(thread_t thread)
6224 {
6225 /* Special case handling */
6226 if (thread->state & TH_IDLE) {
6227 return PERFCONTROL_CLASS_IDLE;
6228 }
6229 if (thread->sched_mode == TH_MODE_REALTIME) {
6230 return PERFCONTROL_CLASS_REALTIME;
6231 }
6232
6233 /* perfcontrol_class based on base_pri */
6234 if (thread->base_pri <= MAXPRI_THROTTLE) {
6235 return PERFCONTROL_CLASS_BACKGROUND;
6236 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6237 return PERFCONTROL_CLASS_UTILITY;
6238 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6239 return PERFCONTROL_CLASS_NONUI;
6240 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6241 return PERFCONTROL_CLASS_UI;
6242 } else {
6243 if (get_threadtask(thread) == kernel_task) {
6244 /*
6245 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6246 * All other lower priority kernel threads should be treated
6247 * as regular threads for performance control purposes.
6248 */
6249 return PERFCONTROL_CLASS_KERNEL;
6250 }
6251 return PERFCONTROL_CLASS_ABOVEUI;
6252 }
6253 }
6254
6255 /*
6256 * This is the processor idle loop, which just looks for other threads
6257 * to execute. Processor idle threads invoke this without supplying a
6258 * current thread to idle without an asserted wait state.
6259 *
 * Returns the next thread to execute if dispatched directly.
6261 */
6262
/*
 * Tracepoints used inside the idle loop.  They are compiled out by
 * default; change the "#if 0" below to "#if 1" to forward them to
 * KERNEL_DEBUG_CONSTANT.
 */
#if 0
#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
#else
#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
#endif

#if (DEVELOPMENT || DEBUG)
/*
 * cpu_id of a processor that calls delay(500) each time it comes out of
 * machine_idle() in processor_idle().  The default of -1 presumably
 * matches no processor (assumes cpu_ids are non-negative -- TODO confirm).
 */
int sched_idle_delay_cpuid = -1;
#endif
6272
/*
 *	processor_idle:
 *
 *	thread     only used as the thread id in the MACH_IDLE tracepoints
 *	processor  the processor that is idling
 *
 *	Switches the processor's timer from system_state to idle_state,
 *	loops around machine_idle() until one of the exit conditions below
 *	is observed, switches the timer back, clears the scheduling ASTs
 *	and asks thread_select() what to run next.
 *
 *	Returns whatever thread_select() returned for the current thread.
 */
thread_t
processor_idle(
	thread_t                        thread,
	processor_t                     processor)
{
	processor_set_t pset = processor->processor_set;

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	uint64_t ctime = mach_absolute_time();

	/* Account time from here on as idle rather than system. */
	timer_switch(&processor->system_state, ctime, &processor->idle_state);
	processor->current_state = &processor->idle_state;

	cpu_quiescent_counter_leave(ctime);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Exit conditions checked before going (back) to sleep. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		/*
		 * A recommended primary leaves idle for any queued realtime
		 * work in its pset; any other processor only for threads
		 * bound to it.
		 */
		if (processor->is_recommended && (processor->processor_primary == processor)) {
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Debug knob: add a delay after wakeup on the selected CPU. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time());
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	ctime = mach_absolute_time();

	/* Back to accounting time as system time. */
	timer_switch(&processor->idle_state, ctime, &processor->system_state);
	processor->current_state = &processor->system_state;

	cpu_quiescent_counter_join(ctime);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6404
/*
 *	idle_thread:
 *
 *	Each processor has a dedicated thread which
 *	executes the idle loop when there is no suitable
 *	previous context.
 *
 *	This continuation is entered with interrupts disabled.
 *
 *	parameter  asserted to be NULL
 *	result     unused
 *
 *	Runs processor_idle() once and then either switches directly to
 *	the thread it returned or blocks with itself as the continuation.
 */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	/* A thread was selected: switch to it, re-entering here as a continuation. */
	if (new_thread != THREAD_NULL) {
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	thread_block(idle_thread);
	/*NOTREACHED*/
}
6447
6448 kern_return_t
idle_thread_create(processor_t processor)6449 idle_thread_create(
6450 processor_t processor)
6451 {
6452 kern_return_t result;
6453 thread_t thread;
6454 spl_t s;
6455 char name[MAXTHREADNAMESIZE];
6456
6457 result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6458 if (result != KERN_SUCCESS) {
6459 return result;
6460 }
6461
6462 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6463 thread_set_thread_name(thread, name);
6464
6465 s = splsched();
6466 thread_lock(thread);
6467 thread->bound_processor = processor;
6468 processor->idle_thread = thread;
6469 thread->sched_pri = thread->base_pri = IDLEPRI;
6470 thread->state = (TH_RUN | TH_IDLE);
6471 thread->options |= TH_OPT_IDLE_THREAD;
6472 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6473 thread_unlock(thread);
6474 splx(s);
6475
6476 thread_deallocate(thread);
6477
6478 return KERN_SUCCESS;
6479 }
6480
6481 /*
6482 * sched_startup:
6483 *
6484 * Kicks off scheduler services.
6485 *
6486 * Called at splsched.
6487 */
6488 void
sched_startup(void)6489 sched_startup(void)
6490 {
6491 kern_return_t result;
6492 thread_t thread;
6493
6494 simple_lock_init(&sched_vm_group_list_lock, 0);
6495
6496 #if __arm__ || __arm64__
6497 simple_lock_init(&sched_recommended_cores_lock, 0);
6498 #endif /* __arm__ || __arm64__ */
6499
6500 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6501 NULL, MAXPRI_KERNEL, &thread);
6502 if (result != KERN_SUCCESS) {
6503 panic("sched_startup");
6504 }
6505
6506 thread_deallocate(thread);
6507
6508 assert_thread_magic(thread);
6509
6510 /*
6511 * Yield to the sched_init_thread once, to
6512 * initialize our own thread after being switched
6513 * back to.
6514 *
6515 * The current thread is the only other thread
6516 * active at this point.
6517 */
6518 thread_block(THREAD_CONTINUE_NULL);
6519 }
6520
#if __arm64__
/*
 * Absolute-time deadline for the perfcontrol callback; 0 means none is
 * armed.  sched_timeshare_consider_maintenance() swaps it to 0 and calls
 * machine_perfcontrol_deadline_passed() once the deadline is reached.
 */
static _Atomic uint64_t sched_perfcontrol_callback_deadline;
#endif /* __arm64__ */


#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Absolute time at or after which the maintenance thread should next be woken. */
static volatile uint64_t        sched_maintenance_deadline;
/* Timestamp taken at the start of the most recent maintenance pass (0 before the first). */
static uint64_t                         sched_tick_last_abstime;
/* Number of scheduler ticks credited by the most recent maintenance pass. */
static uint64_t                         sched_tick_delta;
/* Largest sched_tick_delta observed so far. */
uint64_t                                sched_tick_max_delta;
6532
6533
/*
 *	sched_timeshare_maintenance_continue:
 *
 *	Perform periodic bookkeeping functions about ten
 *	times per second.
 *
 *	Runs as a continuation: after each pass it asserts a wait on its
 *	own address and blocks with itself as the continuation.  The
 *	wakeup is issued by sched_timeshare_consider_maintenance().
 */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t        sched_tick_ctime, late_time;

	/* UINT64_MAX in each field means "no runnable thread seen yet". */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First pass: there is no previous timestamp to measure against. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 *  Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 *  Scan the run queues for threads which
	 *  may need to be updated, and find the earliest runnable thread on the runqueue
	 *  to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/*
	 * Latency per class is "now" minus the earliest make-runnable time
	 * found by the scans, or 0 when that time is not in the past
	 * (which includes the untouched UINT64_MAX initial value).
	 */
	uint64_t bg_max_latency       = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency  = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm__ || __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm__ || __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until the next wakeup; execution restarts at the top of this function. */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6645
/* Count of wakeups issued to the maintenance thread by the routine below. */
static uint64_t sched_maintenance_wakeups;

/*
 * Determine if the set of routines formerly driven by a maintenance timer
 * must be invoked, based on a deadline comparison. Signals the scheduler
 * maintenance thread on deadline expiration. Must be invoked at an interval
 * lower than the "sched_tick_interval", currently accomplished by
 * invocation via the quantum expiration timer and at context switch time.
 * Performance matters: this routine reuses a timestamp approximating the
 * current absolute time received from the caller, and should perform
 * no more than a comparison against the deadline in the common case.
 *
 * ctime  caller-supplied approximation of the current absolute time
 */
void
sched_timeshare_consider_maintenance(uint64_t ctime)
{
	cpu_quiescent_counter_checkin(ctime);

	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread does not wake itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/*
		 * Only the caller whose compare-and-swap installs the new
		 * deadline issues the wakeup.
		 */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
		}
	}

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/*
		 * The deadline is swapped to 0 while the load is computed;
		 * a deadline of 0 fails the check above, so other callers
		 * skip this block until the new deadline is stored.
		 */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
6707
6708 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6709
/*
 *	sched_init_thread:
 *
 *	Entry point of the thread started by sched_startup().
 *
 *	Blocks once, then names the current thread
 *	"sched_maintenance_thread", records it in sched_maintenance_thread
 *	and enters the scheduler's maintenance continuation.
 */
void
sched_init_thread(void)
{
	thread_block(THREAD_CONTINUE_NULL);

	thread_t thread = current_thread();

	thread_set_thread_name(thread, "sched_maintenance_thread");

	sched_maintenance_thread = thread;

	SCHED(maintenance_continuation)();

	/*NOTREACHED*/
}
6725
6726 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6727
6728 /*
6729 * thread_update_scan / runq_scan:
6730 *
6731 * Scan the run queues to account for timesharing threads
6732 * which need to be updated.
6733 *
6734 * Scanner runs in two passes. Pass one squirrels likely
6735 * threads away in an array, pass two does the update.
6736 *
6737 * This is necessary because the run queue is locked for
6738 * the candidate scan, but the thread is locked for the update.
6739 *
6740 * Array should be sized to make forward progress, without
6741 * disabling preemption for long periods.
6742 */
6743
6744 #define THREAD_UPDATE_SIZE 128
6745
6746 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
6747 static uint32_t thread_update_count = 0;
6748
6749 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
6750 boolean_t
thread_update_add_thread(thread_t thread)6751 thread_update_add_thread(thread_t thread)
6752 {
6753 if (thread_update_count == THREAD_UPDATE_SIZE) {
6754 return FALSE;
6755 }
6756
6757 thread_update_array[thread_update_count++] = thread;
6758 thread_reference(thread);
6759 return TRUE;
6760 }
6761
6762 void
thread_update_process_threads(void)6763 thread_update_process_threads(void)
6764 {
6765 assert(thread_update_count <= THREAD_UPDATE_SIZE);
6766
6767 for (uint32_t i = 0; i < thread_update_count; i++) {
6768 thread_t thread = thread_update_array[i];
6769 assert_thread_magic(thread);
6770 thread_update_array[i] = THREAD_NULL;
6771
6772 spl_t s = splsched();
6773 thread_lock(thread);
6774 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
6775 SCHED(update_priority)(thread);
6776 }
6777 thread_unlock(thread);
6778 splx(s);
6779
6780 thread_deallocate(thread);
6781 }
6782
6783 thread_update_count = 0;
6784 }
6785
6786 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)6787 runq_scan_thread(
6788 thread_t thread,
6789 sched_update_scan_context_t scan_context)
6790 {
6791 assert_thread_magic(thread);
6792
6793 if (thread->sched_stamp != sched_tick &&
6794 thread->sched_mode == TH_MODE_TIMESHARE) {
6795 if (thread_update_add_thread(thread) == FALSE) {
6796 return TRUE;
6797 }
6798 }
6799
6800 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
6801 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
6802 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
6803 }
6804 } else {
6805 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
6806 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
6807 }
6808 }
6809
6810 return FALSE;
6811 }
6812
6813 /*
6814 * Scan a runq for candidate threads.
6815 *
6816 * Returns TRUE if retry is needed.
6817 */
6818 boolean_t
runq_scan(run_queue_t runq,sched_update_scan_context_t scan_context)6819 runq_scan(
6820 run_queue_t runq,
6821 sched_update_scan_context_t scan_context)
6822 {
6823 int count = runq->count;
6824 int queue_index;
6825
6826 assert(count >= 0);
6827
6828 if (count == 0) {
6829 return FALSE;
6830 }
6831
6832 for (queue_index = bitmap_first(runq->bitmap, NRQS);
6833 queue_index >= 0;
6834 queue_index = bitmap_next(runq->bitmap, queue_index)) {
6835 thread_t thread;
6836 circle_queue_t queue = &runq->queues[queue_index];
6837
6838 cqe_foreach_element(thread, queue, runq_links) {
6839 assert(count > 0);
6840 if (runq_scan_thread(thread, scan_context) == TRUE) {
6841 return TRUE;
6842 }
6843 count--;
6844 }
6845 }
6846
6847 return FALSE;
6848 }
6849
6850 #if CONFIG_SCHED_CLUTCH
6851
6852 boolean_t
sched_clutch_timeshare_scan(queue_t thread_queue,uint16_t thread_count,sched_update_scan_context_t scan_context)6853 sched_clutch_timeshare_scan(
6854 queue_t thread_queue,
6855 uint16_t thread_count,
6856 sched_update_scan_context_t scan_context)
6857 {
6858 if (thread_count == 0) {
6859 return FALSE;
6860 }
6861
6862 thread_t thread;
6863 qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
6864 if (runq_scan_thread(thread, scan_context) == TRUE) {
6865 return TRUE;
6866 }
6867 thread_count--;
6868 }
6869
6870 assert(thread_count == 0);
6871 return FALSE;
6872 }
6873
6874
6875 #endif /* CONFIG_SCHED_CLUTCH */
6876
6877 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6878
6879 bool
thread_is_eager_preempt(thread_t thread)6880 thread_is_eager_preempt(thread_t thread)
6881 {
6882 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
6883 }
6884
6885 void
thread_set_eager_preempt(thread_t thread)6886 thread_set_eager_preempt(thread_t thread)
6887 {
6888 spl_t s = splsched();
6889 thread_lock(thread);
6890
6891 assert(!thread_is_eager_preempt(thread));
6892
6893 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
6894
6895 if (thread == current_thread()) {
6896 /* csw_check updates current_is_eagerpreempt on the processor */
6897 ast_t ast = csw_check(thread, current_processor(), AST_NONE);
6898
6899 thread_unlock(thread);
6900
6901 if (ast != AST_NONE) {
6902 thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
6903 }
6904 } else {
6905 processor_t last_processor = thread->last_processor;
6906
6907 if (last_processor != PROCESSOR_NULL &&
6908 last_processor->state == PROCESSOR_RUNNING &&
6909 last_processor->active_thread == thread) {
6910 cause_ast_check(last_processor);
6911 }
6912
6913 thread_unlock(thread);
6914 }
6915
6916 splx(s);
6917 }
6918
6919 void
thread_clear_eager_preempt(thread_t thread)6920 thread_clear_eager_preempt(thread_t thread)
6921 {
6922 spl_t s = splsched();
6923 thread_lock(thread);
6924
6925 assert(thread_is_eager_preempt(thread));
6926
6927 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
6928
6929 if (thread == current_thread()) {
6930 current_processor()->current_is_eagerpreempt = false;
6931 }
6932
6933 thread_unlock(thread);
6934 splx(s);
6935 }
6936
6937 /*
6938 * Scheduling statistics
6939 */
6940 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)6941 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
6942 {
6943 struct sched_statistics *stats;
6944 boolean_t to_realtime = FALSE;
6945
6946 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
6947 stats->csw_count++;
6948
6949 if (otherpri >= BASEPRI_REALTIME) {
6950 stats->rt_sched_count++;
6951 to_realtime = TRUE;
6952 }
6953
6954 if ((reasons & AST_PREEMPT) != 0) {
6955 stats->preempt_count++;
6956
6957 if (selfpri >= BASEPRI_REALTIME) {
6958 stats->preempted_rt_count++;
6959 }
6960
6961 if (to_realtime) {
6962 stats->preempted_by_rt_count++;
6963 }
6964 }
6965 }
6966
6967 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)6968 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
6969 {
6970 uint64_t timestamp = mach_absolute_time();
6971
6972 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
6973 stats->last_change_timestamp = timestamp;
6974 }
6975
6976 /*
6977 * For calls from assembly code
6978 */
6979 #undef thread_wakeup
6980 void
6981 thread_wakeup(
6982 event_t x);
6983
6984 void
thread_wakeup(event_t x)6985 thread_wakeup(
6986 event_t x)
6987 {
6988 thread_wakeup_with_result(x, THREAD_AWAKENED);
6989 }
6990
6991 boolean_t
preemption_enabled(void)6992 preemption_enabled(void)
6993 {
6994 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
6995 }
6996
6997 static void
sched_timer_deadline_tracking_init(void)6998 sched_timer_deadline_tracking_init(void)
6999 {
7000 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
7001 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
7002 }
7003
7004 #if __arm__ || __arm64__
7005
7006 uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
7007 uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
7008 bool perfcontrol_failsafe_active = false;
7009 bool perfcontrol_sleep_override = false;
7010
7011 uint64_t perfcontrol_failsafe_maintenance_runnable_time;
7012 uint64_t perfcontrol_failsafe_activation_time;
7013 uint64_t perfcontrol_failsafe_deactivation_time;
7014
7015 /* data covering who likely caused it and how long they ran */
7016 #define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
7017 char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
7018 int perfcontrol_failsafe_pid;
7019 uint64_t perfcontrol_failsafe_tid;
7020 uint64_t perfcontrol_failsafe_thread_timer_at_start;
7021 uint64_t perfcontrol_failsafe_thread_timer_last_seen;
7022 uint32_t perfcontrol_failsafe_recommended_at_trigger;
7023
7024 /*
7025 * Perf controller calls here to update the recommended core bitmask.
7026 * If the failsafe is active, we don't immediately apply the new value.
7027 * Instead, we store the new request and use it after the failsafe deactivates.
7028 *
7029 * If the failsafe is not active, immediately apply the update.
7030 *
7031 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7032 * interrupts are enabled
7033 *
7034 * currently prototype is in osfmk/arm/machine_routines.h
7035 */
7036 void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)7037 sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
7038 {
7039 assert(preemption_enabled());
7040
7041 spl_t s = splsched();
7042 simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7043
7044 perfcontrol_requested_recommended_cores = recommended_cores;
7045 perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
7046
7047 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
7048 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7049 } else {
7050 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7051 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7052 perfcontrol_requested_recommended_cores,
7053 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7054 }
7055
7056 simple_unlock(&sched_recommended_cores_lock);
7057 splx(s);
7058 }
7059
7060 void
sched_override_recommended_cores_for_sleep(void)7061 sched_override_recommended_cores_for_sleep(void)
7062 {
7063 spl_t s = splsched();
7064 simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7065
7066 if (perfcontrol_sleep_override == false) {
7067 perfcontrol_sleep_override = true;
7068 sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
7069 }
7070
7071 simple_unlock(&sched_recommended_cores_lock);
7072 splx(s);
7073 }
7074
7075 void
sched_restore_recommended_cores_after_sleep(void)7076 sched_restore_recommended_cores_after_sleep(void)
7077 {
7078 spl_t s = splsched();
7079 simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7080
7081 if (perfcontrol_sleep_override == true) {
7082 perfcontrol_sleep_override = false;
7083 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7084 }
7085
7086 simple_unlock(&sched_recommended_cores_lock);
7087 splx(s);
7088 }
7089
7090 /*
7091 * Consider whether we need to activate the recommended cores failsafe
7092 *
7093 * Called from quantum timer interrupt context of a realtime thread
7094 * No scheduler locks are held, interrupts are disabled
7095 */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * Parameters:
	 *   ctime      - caller's timestamp; compared against
	 *                last_made_runnable_time and computation_epoch
	 *                (presumably mach absolute time - confirm with caller)
	 *   cur_thread - the thread that is blamed if the failsafe triggers
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	/* Unlocked peek; the flag is re-read under the lock before acting on it */
	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
		/* keep track of how long the responsible thread runs */

		simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

		if (perfcontrol_failsafe_active == TRUE &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
			    timer_grab(&cur_thread->system_timer);
		}

		simple_unlock(&sched_recommended_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
		return;
	}

	/* Anything made runnable before this point counts as starved */
	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	/*
	 * Only proceed if the maintenance thread is on a run queue, is
	 * runnable and not waiting, and has been runnable since before
	 * too_long_ago.
	 */
	if (m_thread->runq == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	/* Snapshot while the thread lock is still held */
	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation.  We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Re-check under the lock: the failsafe may have been activated since the unlocked peek */
	if (perfcontrol_failsafe_active == TRUE) {
		simple_unlock(&sched_recommended_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);

	/* Record the activation state, all under sched_recommended_cores_lock */
	perfcontrol_failsafe_active = TRUE;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	/* Sum of the thread's user and system timers at this moment */
	uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start  = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED);

	simple_unlock(&sched_recommended_cores_lock);
}
7206
7207 /*
7208 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7209 *
7210 * Runs in the context of the maintenance thread, no locks held
7211 */
7212 static void
sched_recommended_cores_maintenance(void)7213 sched_recommended_cores_maintenance(void)
7214 {
7215 /* Common case - no failsafe, nothing to be done here */
7216 if (__probable(perfcontrol_failsafe_active == FALSE)) {
7217 return;
7218 }
7219
7220 uint64_t ctime = mach_absolute_time();
7221
7222 boolean_t print_diagnostic = FALSE;
7223 char p_name[FAILSAFE_NAME_LEN] = "";
7224
7225 spl_t s = splsched();
7226 simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7227
7228 /* Check again, under the lock, to avoid races */
7229 if (perfcontrol_failsafe_active == FALSE) {
7230 goto out;
7231 }
7232
7233 /*
7234 * Ensure that the other cores get another few ticks to run some threads
7235 * If we don't have this hysteresis, the maintenance thread is the first
7236 * to run, and then it immediately kills the other cores
7237 */
7238 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7239 goto out;
7240 }
7241
7242 /* Capture some diagnostic state under the lock so we can print it out later */
7243
7244 int pid = perfcontrol_failsafe_pid;
7245 uint64_t tid = perfcontrol_failsafe_tid;
7246
7247 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
7248 perfcontrol_failsafe_thread_timer_at_start;
7249 uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
7250 uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
7251 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
7252 strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7253
7254 print_diagnostic = TRUE;
7255
7256 /* Deactivate the failsafe and reinstate the requested recommendation settings */
7257
7258 perfcontrol_failsafe_deactivation_time = ctime;
7259 perfcontrol_failsafe_active = FALSE;
7260
7261 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7262 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7263 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7264
7265 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7266
7267 out:
7268 simple_unlock(&sched_recommended_cores_lock);
7269 splx(s);
7270
7271 if (print_diagnostic) {
7272 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7273
7274 absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
7275 failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
7276
7277 absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
7278 thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
7279
7280 printf("recommended core failsafe kicked in for %lld ms "
7281 "likely due to %s[%d] thread 0x%llx spending "
7282 "%lld ms on cpu at realtime priority - "
7283 "new recommendation: 0x%x -> 0x%x\n",
7284 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7285 rec_cores_before, rec_cores_after);
7286 }
7287 }
7288
7289 #endif /* __arm__ || __arm64__ */
7290
7291 kern_return_t
sched_processor_enable(processor_t processor,boolean_t enable)7292 sched_processor_enable(processor_t processor, boolean_t enable)
7293 {
7294 assert(preemption_enabled());
7295
7296 spl_t s = splsched();
7297 simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7298
7299 if (enable) {
7300 bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
7301 } else {
7302 bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
7303 }
7304
7305 #if __arm__ || __arm64__
7306 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
7307 sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7308 } else {
7309 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7310 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7311 perfcontrol_requested_recommended_cores,
7312 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7313 }
7314 #else /* __arm__ || __arm64__ */
7315 sched_update_recommended_cores(usercontrol_requested_recommended_cores);
7316 #endif /* !__arm__ || __arm64__ */
7317
7318 simple_unlock(&sched_recommended_cores_lock);
7319 splx(s);
7320
7321 return KERN_SUCCESS;
7322 }
7323
7324
7325 /*
7326 * Apply a new recommended cores mask to the processors it affects
7327 * Runs after considering failsafes and such
7328 *
7329 * Iterate over processors and update their ->is_recommended field.
7330 * If a processor is running, we let it drain out at its next
7331 * quantum expiration or blocking point. If a processor is idle, there
7332 * may be more work for it to do, so IPI it.
7333 *
7334 * interrupts disabled, sched_recommended_cores_lock is held
7335 */
static void
sched_update_recommended_cores(uint64_t recommended_cores)
{
	/* CPUs that were idle when newly recommended; signalled after both passes */
	uint64_t        needs_exit_idle_mask = 0x0;

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm__ || __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm__ || __arm64__ */
	    0, 0, 0);
#endif /* ! __arm__ || __arm64__ */

	/* An empty mask is never applied as-is */
	if (__builtin_popcountll(recommended_cores) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/*
			 * changed_recommendations: CPUs of this pset whose requested
			 * state differs from pset->recommended_bitmask (read before
			 * the pset lock is taken).
			 */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				/* Idle processors (other than this one) are remembered and signalled at the end */
				if (processor->state == PROCESSOR_IDLE) {
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				/* Off-line processors do not count towards the user-visible availability counters */
				if (processor->state != PROCESSOR_OFF_LINE) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Same difference computation as the first pass, now selecting the bits being cleared */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if (processor->state != PROCESSOR_OFF_LINE) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				/* Decide on the IPI while the pset lock is still held */
				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				/* The current processor gets AST_PREEMPT instead of an IPI to itself */
				if (ipi_type != SCHED_IPI_NONE) {
					if (processor == current_processor()) {
						ast_on(AST_PREEMPT);
					} else {
						sched_ipi_perform(processor, ipi_type);
					}
				}

				/* Re-take the lock dropped by the queue shutdown before the next iteration */
				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
7455
7456 void
thread_set_options(uint32_t thopt)7457 thread_set_options(uint32_t thopt)
7458 {
7459 spl_t x;
7460 thread_t t = current_thread();
7461
7462 x = splsched();
7463 thread_lock(t);
7464
7465 t->options |= thopt;
7466
7467 thread_unlock(t);
7468 splx(x);
7469 }
7470
7471 void
thread_set_pending_block_hint(thread_t thread,block_hint_t block_hint)7472 thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
7473 {
7474 thread->pending_block_hint = block_hint;
7475 }
7476
7477 uint32_t
qos_max_parallelism(int qos,uint64_t options)7478 qos_max_parallelism(int qos, uint64_t options)
7479 {
7480 return SCHED(qos_max_parallelism)(qos, options);
7481 }
7482
7483 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)7484 sched_qos_max_parallelism(__unused int qos, uint64_t options)
7485 {
7486 host_basic_info_data_t hinfo;
7487 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
7488
7489
7490 /*
7491 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
7492 * implement their own qos_max_parallelism() interfaces.
7493 */
7494 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
7495
7496 /* Query the machine layer for core information */
7497 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
7498 (host_info_t)&hinfo, &count);
7499 assert(kret == KERN_SUCCESS);
7500
7501 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
7502 return hinfo.logical_cpu;
7503 } else {
7504 return hinfo.physical_cpu;
7505 }
7506 }
7507
7508 int sched_allow_NO_SMT_threads = 1;
7509 bool
thread_no_smt(thread_t thread)7510 thread_no_smt(thread_t thread)
7511 {
7512 return sched_allow_NO_SMT_threads &&
7513 (thread->bound_processor == PROCESSOR_NULL) &&
7514 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
7515 }
7516
7517 bool
processor_active_thread_no_smt(processor_t processor)7518 processor_active_thread_no_smt(processor_t processor)
7519 {
7520 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
7521 }
7522
7523 #if __arm64__
7524
7525 /*
7526 * Set up or replace old timer with new timer
7527 *
7528 * Returns true if canceled old timer, false if it did not
7529 */
7530 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)7531 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
7532 {
7533 /*
7534 * Exchange deadline for new deadline, if old deadline was nonzero,
7535 * then I cancelled the callback, otherwise I didn't
7536 */
7537
7538 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
7539 relaxed) != 0;
7540 }
7541
7542 #endif /* __arm64__ */
7543
7544 #if CONFIG_SCHED_EDGE
7545
7546 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
7547
7548 /*
7549 * sched_edge_pset_running_higher_bucket()
7550 *
7551 * Routine to calculate cumulative running counts for each scheduling
7552 * bucket. This effectively lets the load calculation calculate if a
7553 * cluster is running any threads at a QoS lower than the thread being
7554 * migrated etc.
7555 */
7556
7557 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)7558 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
7559 {
7560 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
7561
7562 /* Edge Scheduler Optimization */
7563 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
7564 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
7565 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
7566 running_higher[bucket]++;
7567 }
7568 }
7569 }
7570
7571 /*
7572 * sched_update_pset_load_average()
7573 *
7574 * Updates the load average for each sched bucket for a cluster.
7575 * This routine must be called with the pset lock held.
7576 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	/* A curtime of 0 means "not supplied": read the clock here instead */
	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* The recorded last update is later than curtime; skip this update */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp the elapsed time to 32 bits */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

#if CONFIG_SCHED_EDGE
	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		/* Only emit a tracepoint when the value actually changed */
		if (old_shared_load != new_shared_load) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}
#endif /* CONFIG_SCHED_EDGE */

	/* Per-bucket cumulative count of running CPUs, filled in by the helper */
	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		/* Runnable + realtime + running threads, divided (integer division) by the available CPUs */
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Transition between zero and non-zero load: take the instantaneous depth */
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		/* Trace the integer and fractional parts only when the average changed */
		if (load_average != old_load_average) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
7659
7660 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)7661 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
7662 {
7663 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
7664 uint64_t avg_thread_execution_time = 0;
7665
7666 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
7667 old_execution_time_packed.pset_execution_time_packed,
7668 new_execution_time_packed.pset_execution_time_packed, relaxed, {
7669 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
7670 int64_t delta_ticks = curtime - last_update;
7671 if (delta_ticks < 0) {
7672 /*
7673 * Its possible that another CPU came in and updated the pset_execution_time
7674 * before this CPU could do it. Since the average execution time is meant to
7675 * be an approximate measure per cluster, ignore the older update.
7676 */
7677 os_atomic_rmw_loop_give_up(return );
7678 }
7679 uint64_t delta_nsecs = 0;
7680 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
7681
7682 uint64_t nanotime = 0;
7683 absolutetime_to_nanoseconds(execution_time, &nanotime);
7684 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
7685
7686 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
7687 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
7688
7689 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
7690 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
7691 new_execution_time_packed.pset_execution_time_last_update = curtime;
7692 });
7693 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
7694 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
7695 }
7696 }
7697
7698 uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset,cluster_shared_rsrc_type_t shared_rsrc_type)7699 sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
7700 {
7701 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
7702 }
7703
7704 #else /* CONFIG_SCHED_EDGE */
7705
7706 void
sched_update_pset_load_average(processor_set_t pset,__unused uint64_t curtime)7707 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
7708 {
7709 int non_rt_load = pset->pset_runq.count;
7710 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
7711 int new_load_average = ((int)pset->load_average + load) >> 1;
7712
7713 pset->load_average = new_load_average;
7714 #if (DEVELOPMENT || DEBUG)
7715 #if __AMP__
7716 if (pset->pset_cluster_type == PSET_AMP_P) {
7717 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
7718 }
7719 #endif
7720 #endif
7721 }
7722
7723 void
sched_update_pset_avg_execution_time(__unused processor_set_t pset,__unused uint64_t execution_time,__unused uint64_t curtime,__unused sched_bucket_t sched_bucket)7724 sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
7725 {
7726 }
7727
7728 #endif /* CONFIG_SCHED_EDGE */
7729
7730 /* pset is locked */
7731 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)7732 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
7733 {
7734 int cpuid = processor->cpu_id;
7735 #if defined(__x86_64__)
7736 if (sched_avoid_cpu0 && (cpuid == 0)) {
7737 return false;
7738 }
7739 #endif
7740
7741 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
7742
7743 return bit_test(fasttrack_map, cpuid);
7744 }
7745
7746 /* pset is locked */
7747 static processor_t
choose_processor_for_realtime_thread(processor_set_t pset,processor_t skip_processor,bool consider_secondaries,bool skip_spills)7748 choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
7749 {
7750 #if defined(__x86_64__)
7751 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
7752 #else
7753 const bool avoid_cpu0 = false;
7754 #endif
7755 cpumap_t cpu_map;
7756
7757 try_again:
7758 cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
7759 if (skip_processor) {
7760 bit_clear(cpu_map, skip_processor->cpu_id);
7761 }
7762 if (skip_spills) {
7763 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
7764 }
7765
7766 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7767 bit_clear(cpu_map, 0);
7768 }
7769
7770 cpumap_t primary_map = cpu_map & pset->primary_map;
7771 if (avoid_cpu0) {
7772 primary_map = bit_ror64(primary_map, 1);
7773 }
7774
7775 int rotid = lsb_first(primary_map);
7776 if (rotid >= 0) {
7777 int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
7778
7779 processor_t processor = processor_array[cpuid];
7780
7781 return processor;
7782 }
7783
7784 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
7785 goto out;
7786 }
7787
7788 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7789 /* Also avoid cpu1 */
7790 bit_clear(cpu_map, 1);
7791 }
7792
7793 /* Consider secondary processors whose primary is actually running a realtime thread */
7794 cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
7795 if (avoid_cpu0) {
7796 /* Also avoid cpu1 */
7797 secondary_map = bit_ror64(secondary_map, 2);
7798 }
7799 rotid = lsb_first(secondary_map);
7800 if (rotid >= 0) {
7801 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
7802
7803 processor_t processor = processor_array[cpuid];
7804
7805 return processor;
7806 }
7807
7808 /* Consider secondary processors */
7809 secondary_map = cpu_map & ~pset->primary_map;
7810 if (avoid_cpu0) {
7811 /* Also avoid cpu1 */
7812 secondary_map = bit_ror64(secondary_map, 2);
7813 }
7814 rotid = lsb_first(secondary_map);
7815 if (rotid >= 0) {
7816 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
7817
7818 processor_t processor = processor_array[cpuid];
7819
7820 return processor;
7821 }
7822
7823 /*
7824 * I was hoping the compiler would optimize
7825 * this away when avoid_cpu0 is const bool false
7826 * but it still complains about the assignmnent
7827 * in that case.
7828 */
7829 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7830 #if defined(__x86_64__)
7831 avoid_cpu0 = false;
7832 #else
7833 assert(0);
7834 #endif
7835 goto try_again;
7836 }
7837
7838 out:
7839 if (skip_processor) {
7840 return PROCESSOR_NULL;
7841 }
7842
7843 /*
7844 * If we didn't find an obvious processor to choose, but there are still more CPUs
7845 * not already running realtime threads than realtime threads in the realtime run queue,
7846 * this thread belongs in this pset, so choose some other processor in this pset
7847 * to ensure the thread is enqueued here.
7848 */
7849 cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
7850 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
7851 cpu_map = non_realtime_map;
7852 assert(cpu_map != 0);
7853 int cpuid = bit_first(cpu_map);
7854 assert(cpuid >= 0);
7855 return processor_array[cpuid];
7856 }
7857
7858 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
7859 goto skip_secondaries;
7860 }
7861
7862 non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
7863 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
7864 cpu_map = non_realtime_map;
7865 assert(cpu_map != 0);
7866 int cpuid = bit_first(cpu_map);
7867 assert(cpuid >= 0);
7868 return processor_array[cpuid];
7869 }
7870
7871 skip_secondaries:
7872 return PROCESSOR_NULL;
7873 }
7874
7875 /* pset is locked */
7876 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills)7877 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills)
7878 {
7879 uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
7880 processor_t fd_processor = PROCESSOR_NULL;
7881 processor_t lopri_processor = PROCESSOR_NULL;
7882 int lowest_priority = max_pri;
7883
7884 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
7885 if (skip_processor) {
7886 bit_clear(cpu_map, skip_processor->cpu_id);
7887 }
7888 if (skip_spills) {
7889 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
7890 }
7891
7892 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
7893 processor_t processor = processor_array[cpuid];
7894
7895 if (processor->current_pri > max_pri) {
7896 continue;
7897 }
7898
7899 if (processor->current_pri < lowest_priority) {
7900 lowest_priority = processor->current_pri;
7901 lopri_processor = processor;
7902 continue;
7903 }
7904
7905 if (processor->deadline > furthest_deadline) {
7906 furthest_deadline = processor->deadline;
7907 fd_processor = processor;
7908 }
7909 }
7910
7911 if (lopri_processor) {
7912 if (sched_rt_runq_strict_priority) {
7913 return lopri_processor;
7914 }
7915
7916 if (lopri_processor->deadline > furthest_deadline) {
7917 /* NRG maybe also consider the computations if possible */
7918 return lopri_processor;
7919 }
7920
7921 return PROCESSOR_NULL;
7922 }
7923
7924 return fd_processor;
7925 }
7926
7927 /* pset is locked */
7928 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)7929 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
7930 {
7931 bool skip_spills = true;
7932
7933 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
7934 if (next_processor != PROCESSOR_NULL) {
7935 return next_processor;
7936 }
7937
7938 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills);
7939 return next_processor;
7940 }
7941
7942 #if defined(__x86_64__)
7943 /* pset is locked */
7944 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)7945 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
7946 {
7947 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
7948 int nbackup_cpus = 0;
7949
7950 if (include_backups && rt_runq_is_low_latency(pset)) {
7951 nbackup_cpus = sched_rt_n_backup_processors;
7952 }
7953
7954 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
7955 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7956 bit_clear(cpu_map, 0);
7957 }
7958 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
7959 }
7960
7961 /* pset is locked */
7962 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)7963 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
7964 {
7965 int nbackup_cpus = 0;
7966
7967 if (include_backups && rt_runq_is_low_latency(pset)) {
7968 nbackup_cpus = sched_rt_n_backup_processors;
7969 }
7970
7971 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
7972 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
7973 }
7974 #endif
7975
7976 static bool
sched_ok_to_run_realtime_thread(processor_set_t pset,processor_t processor,bool as_backup)7977 sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
7978 {
7979 if (!processor->is_recommended) {
7980 return false;
7981 }
7982 bool ok_to_run_realtime_thread = true;
7983 #if defined(__x86_64__)
7984 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
7985 if (spill_pending) {
7986 return true;
7987 }
7988 if (processor->cpu_id == 0) {
7989 if (sched_avoid_cpu0 == 1) {
7990 ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
7991 } else if (sched_avoid_cpu0 == 2) {
7992 ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
7993 }
7994 } else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
7995 ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
7996 } else if (processor->processor_primary != processor) {
7997 ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
7998 }
7999 #else
8000 (void)pset;
8001 (void)processor;
8002 (void)as_backup;
8003 #endif
8004 return ok_to_run_realtime_thread;
8005 }
8006
8007 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)8008 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
8009 {
8010 if (drop_lock) {
8011 pset_unlock(pset);
8012 }
8013 }
8014
8015 void
thread_set_no_smt(bool set)8016 thread_set_no_smt(bool set)
8017 {
8018 if (!system_is_SMT) {
8019 /* Not a machine that supports SMT */
8020 return;
8021 }
8022
8023 thread_t thread = current_thread();
8024
8025 spl_t s = splsched();
8026 thread_lock(thread);
8027 if (set) {
8028 thread->sched_flags |= TH_SFLAG_NO_SMT;
8029 }
8030 thread_unlock(thread);
8031 splx(s);
8032 }
8033
8034 bool
thread_get_no_smt(void)8035 thread_get_no_smt(void)
8036 {
8037 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8038 }
8039
8040 extern void task_set_no_smt(task_t);
8041 void
task_set_no_smt(task_t task)8042 task_set_no_smt(task_t task)
8043 {
8044 if (!system_is_SMT) {
8045 /* Not a machine that supports SMT */
8046 return;
8047 }
8048
8049 if (task == TASK_NULL) {
8050 task = current_task();
8051 }
8052
8053 task_lock(task);
8054 task->t_flags |= TF_NO_SMT;
8055 task_unlock(task);
8056 }
8057
8058 #if DEBUG || DEVELOPMENT
8059 extern void sysctl_task_set_no_smt(char no_smt);
8060 void
sysctl_task_set_no_smt(char no_smt)8061 sysctl_task_set_no_smt(char no_smt)
8062 {
8063 if (!system_is_SMT) {
8064 /* Not a machine that supports SMT */
8065 return;
8066 }
8067
8068 task_t task = current_task();
8069
8070 task_lock(task);
8071 if (no_smt == '1') {
8072 task->t_flags |= TF_NO_SMT;
8073 }
8074 task_unlock(task);
8075 }
8076
8077 extern char sysctl_task_get_no_smt(void);
8078 char
sysctl_task_get_no_smt(void)8079 sysctl_task_get_no_smt(void)
8080 {
8081 task_t task = current_task();
8082
8083 if (task->t_flags & TF_NO_SMT) {
8084 return '1';
8085 }
8086 return '0';
8087 }
8088 #endif /* DEVELOPMENT || DEBUG */
8089
8090
8091 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8092 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8093 {
8094 #if __AMP__
8095 spl_t s = splsched();
8096 thread_lock(thread);
8097 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8098 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8099 if (soft_bound) {
8100 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8101 }
8102 switch (cluster_type) {
8103 case 'e':
8104 case 'E':
8105 if (pset0.pset_cluster_type == PSET_AMP_E) {
8106 thread->th_bound_cluster_id = pset0.pset_id;
8107 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8108 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8109 }
8110 break;
8111 case 'p':
8112 case 'P':
8113 if (pset0.pset_cluster_type == PSET_AMP_P) {
8114 thread->th_bound_cluster_id = pset0.pset_id;
8115 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8116 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8117 }
8118 break;
8119 default:
8120 break;
8121 }
8122 thread_unlock(thread);
8123 splx(s);
8124
8125 if (thread == current_thread()) {
8126 thread_block(THREAD_CONTINUE_NULL);
8127 }
8128 #else /* __AMP__ */
8129 (void)thread;
8130 (void)cluster_type;
8131 (void)soft_bound;
8132 #endif /* __AMP__ */
8133 }
8134
8135 extern uint32_t thread_bound_cluster_id(thread_t thread);
8136 uint32_t
thread_bound_cluster_id(thread_t thread)8137 thread_bound_cluster_id(thread_t thread)
8138 {
8139 return thread->th_bound_cluster_id;
8140 }
8141
8142 __private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)8143 thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
8144 {
8145 #if __AMP__
8146
8147 processor_set_t pset = NULL;
8148 if (options & (THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY)) {
8149 /* Validate the inputs for the bind case */
8150 int max_clusters = ml_get_cluster_count();
8151 if (cluster_id >= max_clusters) {
8152 /* Invalid cluster id */
8153 return KERN_INVALID_ARGUMENT;
8154 }
8155 pset = pset_array[cluster_id];
8156 if (pset == NULL) {
8157 /* Cluster has not been initialized yet */
8158 return KERN_INVALID_ARGUMENT;
8159 }
8160 if (options & THREAD_BIND_ELIGIBLE_ONLY) {
8161 if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
8162 /* Thread is not recommended for the cluster type */
8163 return KERN_INVALID_POLICY;
8164 }
8165 }
8166 }
8167
8168 if (options & THREAD_UNBIND) {
8169 /* If the thread was actually not bound to some cluster, nothing to do here */
8170 if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
8171 return KERN_SUCCESS;
8172 }
8173 }
8174
8175 spl_t s = splsched();
8176 thread_lock(thread);
8177
8178 /* Unbind the thread from its previous bound state */
8179 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8180 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8181
8182 if (options & THREAD_UNBIND) {
8183 /* Nothing more to do here */
8184 goto thread_bind_cluster_complete;
8185 }
8186
8187 if (options & THREAD_BIND_SOFT) {
8188 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8189 }
8190 thread->th_bound_cluster_id = cluster_id;
8191
8192 thread_bind_cluster_complete:
8193 thread_unlock(thread);
8194 splx(s);
8195
8196 if (thread == current_thread()) {
8197 thread_block(THREAD_CONTINUE_NULL);
8198 }
8199 #else /* __AMP__ */
8200 (void)thread;
8201 (void)cluster_id;
8202 (void)options;
8203 #endif /* __AMP__ */
8204 return KERN_SUCCESS;
8205 }
8206
8207 #if DEVELOPMENT || DEBUG
8208 extern int32_t sysctl_get_bound_cpuid(void);
8209 int32_t
sysctl_get_bound_cpuid(void)8210 sysctl_get_bound_cpuid(void)
8211 {
8212 int32_t cpuid = -1;
8213 thread_t self = current_thread();
8214
8215 processor_t processor = self->bound_processor;
8216 if (processor == NULL) {
8217 cpuid = -1;
8218 } else {
8219 cpuid = processor->cpu_id;
8220 }
8221
8222 return cpuid;
8223 }
8224
8225 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
8226 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)8227 sysctl_thread_bind_cpuid(int32_t cpuid)
8228 {
8229 processor_t processor = PROCESSOR_NULL;
8230
8231 if (cpuid == -1) {
8232 goto unbind;
8233 }
8234
8235 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
8236 return KERN_INVALID_VALUE;
8237 }
8238
8239 processor = processor_array[cpuid];
8240 if (processor == PROCESSOR_NULL) {
8241 return KERN_INVALID_VALUE;
8242 }
8243
8244 #if __AMP__
8245
8246 thread_t thread = current_thread();
8247
8248 if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
8249 if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
8250 /* Cannot hard-bind an already hard-cluster-bound thread */
8251 return KERN_NOT_SUPPORTED;
8252 }
8253 }
8254
8255 #endif /* __AMP__ */
8256
8257 unbind:
8258 thread_bind(processor);
8259
8260 thread_block(THREAD_CONTINUE_NULL);
8261 return KERN_SUCCESS;
8262 }
8263
8264 extern char sysctl_get_task_cluster_type(void);
8265 char
sysctl_get_task_cluster_type(void)8266 sysctl_get_task_cluster_type(void)
8267 {
8268 task_t task = current_task();
8269 processor_set_t pset_hint = task->pset_hint;
8270
8271 if (!pset_hint) {
8272 return '0';
8273 }
8274
8275 #if __AMP__
8276 if (pset_hint->pset_cluster_type == PSET_AMP_E) {
8277 return 'E';
8278 } else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
8279 return 'P';
8280 }
8281 #endif
8282
8283 return '0';
8284 }
8285
8286 #if __AMP__
8287 static processor_set_t
find_pset_of_type(pset_cluster_type_t t)8288 find_pset_of_type(pset_cluster_type_t t)
8289 {
8290 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
8291 if (node->pset_cluster_type != t) {
8292 continue;
8293 }
8294
8295 processor_set_t pset = PROCESSOR_SET_NULL;
8296 for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8297 pset = pset_array[pset_id];
8298 /* Prefer one with recommended processsors */
8299 if (pset->recommended_bitmask != 0) {
8300 assert(pset->pset_cluster_type == t);
8301 return pset;
8302 }
8303 }
8304 /* Otherwise return whatever was found last */
8305 return pset;
8306 }
8307
8308 return PROCESSOR_SET_NULL;
8309 }
8310 #endif
8311
8312 extern void sysctl_task_set_cluster_type(char cluster_type);
8313 void
sysctl_task_set_cluster_type(char cluster_type)8314 sysctl_task_set_cluster_type(char cluster_type)
8315 {
8316 task_t task = current_task();
8317 processor_set_t pset_hint = PROCESSOR_SET_NULL;
8318
8319 #if __AMP__
8320 switch (cluster_type) {
8321 case 'e':
8322 case 'E':
8323 pset_hint = find_pset_of_type(PSET_AMP_E);
8324 break;
8325 case 'p':
8326 case 'P':
8327 pset_hint = find_pset_of_type(PSET_AMP_P);
8328 break;
8329 default:
8330 break;
8331 }
8332
8333 if (pset_hint) {
8334 task_lock(task);
8335 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
8336 task->pset_hint = pset_hint;
8337 task_unlock(task);
8338
8339 thread_block(THREAD_CONTINUE_NULL);
8340 }
8341 #else
8342 (void)cluster_type;
8343 (void)task;
8344 (void)pset_hint;
8345 #endif
8346 }
8347
8348 #endif /* DEVELOPMENT || DEBUG */
8349