1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80
81 #include <machine/commpage.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #if MONOTONIC
94 #include <kern/monotonic.h>
95 #endif /* MONOTONIC */
96 #include <kern/processor.h>
97 #include <kern/queue.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109 #include <kern/cpu_quiesce.h>
110
111 #include <vm/pmap.h>
112 #include <vm/vm_kern.h>
113 #include <vm/vm_map.h>
114 #include <vm/vm_pageout.h>
115
116 #include <mach/sdt.h>
117 #include <mach/mach_host.h>
118 #include <mach/host_info.h>
119
120 #include <sys/kdebug.h>
121 #include <kperf/kperf.h>
122 #include <kern/kpc.h>
123 #include <san/kasan.h>
124 #include <kern/pms.h>
125 #include <kern/host.h>
126 #include <stdatomic.h>
127
/*
 * Per-CPU pair of AST generation counters.
 * NOTE(review): ast_gen appears to track the latest requested AST
 * generation and ast_ack the latest acknowledged one — confirm against
 * the consumers of this structure.
 */
struct ast_gen_pair {
	os_atomic(ast_gen_t) ast_gen;
	os_atomic(ast_gen_t) ast_ack;
};

static struct ast_gen_pair PERCPU_DATA(ast_gen_pair);
/* Per-CPU scheduler statistics; sched_stats_active presumably gates collection. */
struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;

/*
 * Wrap-safe generation comparison: comparing the signed difference to
 * zero keeps the ordering correct even after the counters wrap.
 */
#define AST_GEN_CMP(a, op, b) ((long)((a) - (b)) op 0)
138
139 __startup_func
140 static void
ast_gen_init(void)141 ast_gen_init(void)
142 {
143 percpu_foreach(pair, ast_gen_pair) {
144 os_atomic_init(&pair->ast_gen, 1);
145 os_atomic_init(&pair->ast_ack, 1);
146 }
147 }
148 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, ast_gen_init);
149
/*
 * Add an interval to a deadline, saturating at UINT64_MAX instead of
 * wrapping on unsigned overflow.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	uint64_t total;

	if (os_add_overflow(d, e, &total)) {
		return UINT64_MAX;
	}
	return total;
}
156
157 int
rt_runq_count(processor_set_t pset)158 rt_runq_count(processor_set_t pset)
159 {
160 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
161 }
162
163 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)164 rt_runq_earliest_deadline(processor_set_t pset)
165 {
166 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
167 }
168
169 static int
rt_runq_priority(processor_set_t pset)170 rt_runq_priority(processor_set_t pset)
171 {
172 pset_assert_locked(pset);
173 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
174
175 bitmap_t *map = rt_run_queue->bitmap;
176 int i = bitmap_first(map, NRTQS);
177 assert(i < NRTQS);
178
179 if (i >= 0) {
180 return i + BASEPRI_RTQUEUES;
181 }
182
183 return i;
184 }
185
/* Forward declaration; defined later in this file. */
static thread_t rt_runq_first(rt_queue_t rt_runq);

#if DEBUG
/*
 * Debug-only invariant checker for an RT run queue: recompute the
 * aggregate count, earliest deadline, constraint and earliest-deadline
 * index from scratch by walking every per-priority queue, and assert
 * that the cached atomic copies match.  If 'thread' is non-NULL it must
 * be found somewhere on the queue.  Caller is expected to hold the pset
 * lock (the walk is unsynchronized otherwise).
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			/* Each per-priority queue only holds threads of that priority. */
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* Queue must be sorted by non-decreasing deadline. */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* Cached per-priority head values must match the head thread. */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* Empty priority level: bitmap bit clear, sentinels cached. */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		/* Track the minimum deadline across all priority levels. */
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* The recomputed aggregates must match the cached atomic copies. */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
#endif
251
252 uint32_t rt_constraint_threshold;
253
254 static bool
rt_runq_is_low_latency(processor_set_t pset)255 rt_runq_is_low_latency(processor_set_t pset)
256 {
257 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
258 }
259
/* Timeshare preemption rates; overridable via the listed boot-args. */
#define DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

#define DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

#define MAX_UNSAFE_QUANTA               800
TUNABLE(int, max_unsafe_quanta, "unsafe", MAX_UNSAFE_QUANTA);

#define MAX_POLL_QUANTA                 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

/* Absolute-time budgets derived in sched_timeshare_timebase_init(). */
uint64_t max_poll_computation;

uint64_t max_unsafe_computation;
uint64_t sched_safe_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/* Timeshare quanta in absolute time, derived from the *_us values below. */
uint32_t std_quantum;
uint32_t min_std_quantum;
uint32_t bg_quantum;

uint32_t std_quantum_us;
uint32_t bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t thread_depress_time;
uint32_t default_timeshare_computation;
uint32_t default_timeshare_constraint;

/* Realtime quantum bounds, in absolute time. */
uint32_t max_rt_quantum;
uint32_t min_rt_quantum;

uint32_t rt_deadline_epsilon;

uint32_t rt_constraint_threshold;
uint32_t rt_constraint_ll;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

unsigned sched_tick;
uint32_t sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t sched_load_compute_interval_us = 15000;
uint64_t sched_load_compute_interval_abs;
static _Atomic uint64_t sched_load_compute_deadline;

uint32_t sched_pri_shifts[TH_BUCKET_MAX];
uint32_t sched_fixed_shift;

uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
                                                       * 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
                                                       * <= 5ms */

uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

/* interrupts disabled lock to guard recommended cores state */
decl_simple_lock_data(static, sched_recommended_cores_lock);
static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
static void sched_update_recommended_cores(uint64_t recommended_cores);

#if __arm__ || __arm64__
static void sched_recommended_cores_maintenance(void);
uint64_t perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm__ || __arm64__ */

uint64_t sched_one_second_interval;
/* Allow direct processor handoff on wakeup; tunable via "direct_handoff". */
boolean_t allow_direct_handoff = TRUE;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t        processor_idle(
	thread_t                        thread,
	processor_t                     processor);

static ast_t
csw_check_locked(
	thread_t                thread,
	processor_t             processor,
	processor_set_t         pset,
	ast_t                   check_reason);

static void processor_setrun(
	processor_t                    processor,
	thread_t                       thread,
	integer_t                      options);

static void
sched_realtime_timebase_init(void);

static void
sched_timer_deadline_tracking_init(void);

#if DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t
thread_bind_internal(
	thread_t                thread,
	processor_t             processor);

static void
sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t          sched_load_shifts[NRQS];
bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 * to export to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
int enable_task_set_cluster_type = 0;
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */
428
/*
 * Boot-time scheduler initialization: read tunables from boot-args and
 * the device tree, then initialize the selected scheduler policy, the
 * RT run queue, and the boot pset/processor.  The initialization order
 * below (SCHED(init) before pset/processor init) is deliberate.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the active scheduler's name (see sched_string's comment). */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

	cpu_quiescent_counter_init();

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		/* Value 2 forces the whole system onto efficiency cores. */
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */
}
471
472 void
sched_timebase_init(void)473 sched_timebase_init(void)
474 {
475 uint64_t abstime;
476
477 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
478 sched_one_second_interval = abstime;
479
480 SCHED(timebase_init)();
481 sched_realtime_timebase_init();
482 }
483
484 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
485
486 void
sched_timeshare_init(void)487 sched_timeshare_init(void)
488 {
489 /*
490 * Calculate the timeslicing quantum
491 * in us.
492 */
493 if (default_preemption_rate < 1) {
494 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
495 }
496 std_quantum_us = (1000 * 1000) / default_preemption_rate;
497
498 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
499
500 if (default_bg_preemption_rate < 1) {
501 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
502 }
503 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
504
505 printf("standard background quantum is %d us\n", bg_quantum_us);
506
507 load_shift_init();
508 preempt_pri_init();
509 sched_tick = 0;
510 }
511
/*
 * Compute the timebase-dependent timeshare scheduling constants.
 * Every converted interval is asserted to be a non-zero value that
 * fits in 32 bits before it is truncated into its uint32_t home.
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t        abstime;
	uint32_t        shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	/* abstime still holds the scheduler tick interval at this point. */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* NOTE(review): INT8_MAX appears to disable the per-bucket priority
	 * shift until the load computation assigns real values — confirm
	 * against the update site. */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm__ || __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm__ || __arm64__ */
}
571
572 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
573
574 void
pset_rt_init(processor_set_t pset)575 pset_rt_init(processor_set_t pset)
576 {
577 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
578 int i = pri - BASEPRI_RTQUEUES;
579 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
580 queue_init(&rqi->pri_queue);
581 rqi->pri_count = 0;
582 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
583 rqi->pri_constraint = RT_CONSTRAINT_NONE;
584 }
585 os_atomic_init(&pset->rt_runq.count, 0);
586 os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
587 os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
588 os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
589 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
590 }
591
/* constraint limit for low latency RT threads */
int rt_constraint_ll_us = 0;

/* Return the current low-latency RT constraint limit, in microseconds. */
int
sched_get_rt_constraint_ll(void)
{
	const int limit_us = rt_constraint_ll_us;

	return limit_us;
}
600
601 void
sched_set_rt_constraint_ll(int new_constraint_us)602 sched_set_rt_constraint_ll(int new_constraint_us)
603 {
604 rt_constraint_ll_us = new_constraint_us;
605
606 uint64_t abstime;
607 clock_interval_to_absolutetime_interval(rt_constraint_ll_us, NSEC_PER_USEC, &abstime);
608 assert((abstime >> 32) == 0 && ((rt_constraint_ll_us == 0) || (uint32_t)abstime != 0));
609 rt_constraint_ll = (uint32_t)abstime;
610 }
611
/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

/* Return the current RT deadline-comparison epsilon, in microseconds. */
int
sched_get_rt_deadline_epsilon(void)
{
	const int epsilon_us = rt_deadline_epsilon_us;

	return epsilon_us;
}
620
621 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)622 sched_set_rt_deadline_epsilon(int new_epsilon_us)
623 {
624 rt_deadline_epsilon_us = new_epsilon_us;
625
626 uint64_t abstime;
627 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
628 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
629 rt_deadline_epsilon = (uint32_t)abstime;
630 }
631
632 static void
sched_realtime_timebase_init(void)633 sched_realtime_timebase_init(void)
634 {
635 uint64_t abstime;
636
637 /* smallest rt computation (50 us) */
638 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
639 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
640 min_rt_quantum = (uint32_t)abstime;
641
642 /* maximum rt computation (50 ms) */
643 clock_interval_to_absolutetime_interval(
644 50, 1000 * NSEC_PER_USEC, &abstime);
645 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
646 max_rt_quantum = (uint32_t)abstime;
647
648 /* constraint threshold for sending backup IPIs (4 ms) */
649 clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
650 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
651 rt_constraint_threshold = (uint32_t)abstime;
652
653 /* constraint limit for low latency RT threads */
654 sched_set_rt_constraint_ll(rt_constraint_ll_us);
655
656 /* epsilon for comparing deadlines */
657 sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
658 }
659
660 void
sched_check_spill(processor_set_t pset,thread_t thread)661 sched_check_spill(processor_set_t pset, thread_t thread)
662 {
663 (void)pset;
664 (void)thread;
665
666 return;
667 }
668
669 bool
sched_thread_should_yield(processor_t processor,thread_t thread)670 sched_thread_should_yield(processor_t processor, thread_t thread)
671 {
672 (void)thread;
673
674 return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
675 }
676
677 /* Default implementations of .steal_thread_enabled */
678 bool
sched_steal_thread_DISABLED(processor_set_t pset)679 sched_steal_thread_DISABLED(processor_set_t pset)
680 {
681 (void)pset;
682 return false;
683 }
684
685 bool
sched_steal_thread_enabled(processor_set_t pset)686 sched_steal_thread_enabled(processor_set_t pset)
687 {
688 return bit_count(pset->node->pset_map) > 1;
689 }
690
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Set up values for timeshare
 * loading factors.
 */
static void
load_shift_init(void)
{
	int8_t          k, *p = sched_load_shifts;
	uint32_t        i, j;

	uint32_t        sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1 get fixed entries before the table fill below. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/* The inner loop fills entries up to the next power-of-two boundary
	 * (scaled by sched_decay_penalty) with the same shift "k"; the outer
	 * loop then advances to the next boundary with a larger "k". */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
742
743 static void
preempt_pri_init(void)744 preempt_pri_init(void)
745 {
746 bitmap_t *p = sched_preempt_pri;
747
748 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
749 bitmap_set(p, i);
750 }
751
752 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
753 bitmap_set(p, i);
754 }
755 }
756
757 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
758
759 void
check_monotonic_time(uint64_t ctime)760 check_monotonic_time(uint64_t ctime)
761 {
762 processor_t processor = current_processor();
763 uint64_t last_dispatch = processor->last_dispatch;
764
765 if (last_dispatch > ctime) {
766 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
767 last_dispatch, ctime);
768 }
769 }
770
771
772 /*
773 * Thread wait timer expiration.
774 */
775 void
thread_timer_expire(void * p0,__unused void * p1)776 thread_timer_expire(
777 void *p0,
778 __unused void *p1)
779 {
780 thread_t thread = p0;
781 spl_t s;
782
783 assert_thread_magic(thread);
784
785 s = splsched();
786 thread_lock(thread);
787 if (--thread->wait_timer_active == 0) {
788 if (thread->wait_timer_is_set) {
789 thread->wait_timer_is_set = FALSE;
790 clear_wait_internal(thread, THREAD_TIMED_OUT);
791 }
792 }
793 thread_unlock(thread);
794 splx(s);
795 }
796
/*
 * thread_unblock:
 *
 * Unblock thread on wake up.
 *
 * Returns TRUE if the thread should now be placed on the runqueue.
 *
 * Thread must be locked.
 *
 * Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t                thread,
	wait_result_t           wresult)
{
	boolean_t               ready_for_runq = FALSE;
	thread_t                cthread = current_thread();
	uint32_t                new_run_count;
	int                     old_thread_state;

	/*
	 * Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 * Cancel pending wait timer.
	 */
	if (thread->wait_timer_is_set) {
		if (timer_call_cancel(thread->wait_timer)) {
			/* Cancel succeeded, so thread_timer_expire() will not run;
			 * drop the count it would otherwise have dropped. */
			thread->wait_timer_active--;
		}
		thread->wait_timer_is_set = FALSE;
	}

	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 * Update scheduling state: not waiting,
	 * set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT);

	if ((old_thread_state & TH_RUN) == 0) {
		/* Transition from blocked to runnable. */
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /*CONFIG_SCHED_AUTO_JOIN */
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		/* The deadline is 'constraint' absolute-time units from now. */
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		/* Woken from interrupt context, and the target is not a callout thread. */
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		if (ttd) {
			/* Bucket the wakeup by how far ahead the timer deadline was. */
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			/* "Double hop": attribute the callout thread's interrupt wakeup
			 * to the thread it is now waking. */
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		/* Record how this callout thread was woken, for the next hop. */
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}
961
962 /*
963 * Routine: thread_allowed_for_handoff
964 * Purpose:
965 * Check if the thread is allowed for handoff operation
966 * Conditions:
967 * thread lock held, IPC locks may be held.
968 * TODO: In future, do not allow handoff if threads have different cluster
969 * recommendations.
970 */
971 boolean_t
thread_allowed_for_handoff(thread_t thread)972 thread_allowed_for_handoff(
973 thread_t thread)
974 {
975 thread_t self = current_thread();
976
977 if (allow_direct_handoff &&
978 thread->sched_mode == TH_MODE_REALTIME &&
979 self->sched_mode == TH_MODE_REALTIME) {
980 return TRUE;
981 }
982
983 return FALSE;
984 }
985
/*
 * Routine: thread_go
 * Purpose:
 *	Unblock and dispatch thread.
 * Conditions:
 *	thread lock held, IPC locks may be held.
 *	thread must have been pulled from wait queue under same lock hold.
 *	thread must have been waiting
 * Returns:
 *	KERN_SUCCESS - Thread was set running
 *
 * TODO: This should return void
 */
kern_return_t
thread_go(
	thread_t        thread,
	wait_result_t   wresult,
	waitq_options_t option)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/* The thread must be a legitimate waiter: not at a safe point,
	 * with no wait event, in a waitable, non-terminating state. */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_wait_possible(thread));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);


	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if ((option & WQ_OPTION_HANDOFF) &&
		    thread_allowed_for_handoff(thread)) {
			/* Direct handoff: take a reference and stash the thread
			 * for the upcoming context switch instead of enqueueing. */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}

	return KERN_SUCCESS;
}
1035
1036 /*
1037 * Routine: thread_mark_wait_locked
1038 * Purpose:
1039 * Mark a thread as waiting. If, given the circumstances,
1040 * it doesn't want to wait (i.e. already aborted), then
1041 * indicate that in the return value.
1042 * Conditions:
1043 * at splsched() and thread is locked.
1044 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t                thread,
	wait_interrupt_t        interruptible_orig)
{
	boolean_t               at_safe_point;
	wait_interrupt_t        interruptible = interruptible_orig;

	/* The idle thread must never block. */
	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 * The thread may have certain types of interrupts/aborts masked
	 * off.  Even if the wait location says these types of interrupts
	 * are OK, we have to honor mask settings (outer-scoped code may
	 * not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/*
	 * Enter the wait unless an abort is already pending and
	 * deliverable at this point (safe aborts only fire at
	 * abort-safe wait points).
	 */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			/* zero effective level (THREAD_UNINT): uninterruptible wait */
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/*
			 * Suppress the sched_call wait report if the caller
			 * asked not to report this wait; kernel and user
			 * tasks use different no-report flags.
			 */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Abort being delivered now: consume a one-shot safe abort. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1113
1114 /*
1115 * Routine: thread_interrupt_level
1116 * Purpose:
1117 * Set the maximum interruptible state for the
1118 * current thread. The effective value of any
1119 * interruptible flag passed into assert_wait
1120 * will never exceed this.
1121 *
1122 * Useful for code that must not be interrupted,
1123 * but which calls code that doesn't know that.
1124 * Returns:
1125 * The old interrupt level for the thread.
1126 */
1127 __private_extern__
1128 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1129 thread_interrupt_level(
1130 wait_interrupt_t new_level)
1131 {
1132 thread_t thread = current_thread();
1133 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1134
1135 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1136
1137 return result;
1138 }
1139
1140 /*
1141 * assert_wait:
1142 *
1143 * Assert that the current thread is about to go to
1144 * sleep until the specified event occurs.
1145 */
1146 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1147 assert_wait(
1148 event_t event,
1149 wait_interrupt_t interruptible)
1150 {
1151 if (__improbable(event == NO_EVENT)) {
1152 panic("%s() called with NO_EVENT", __func__);
1153 }
1154
1155 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1156 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1157 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1158
1159 struct waitq *waitq;
1160 waitq = global_eventq(event);
1161 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1162 }
1163
1164 /*
1165 * assert_wait_queue:
1166 *
1167 * Return the global waitq for the specified event
1168 */
1169 struct waitq *
assert_wait_queue(event_t event)1170 assert_wait_queue(
1171 event_t event)
1172 {
1173 return global_eventq(event);
1174 }
1175
1176 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1177 assert_wait_timeout(
1178 event_t event,
1179 wait_interrupt_t interruptible,
1180 uint32_t interval,
1181 uint32_t scale_factor)
1182 {
1183 thread_t thread = current_thread();
1184 wait_result_t wresult;
1185 uint64_t deadline;
1186 spl_t s;
1187
1188 if (__improbable(event == NO_EVENT)) {
1189 panic("%s() called with NO_EVENT", __func__);
1190 }
1191
1192 struct waitq *waitq;
1193 waitq = global_eventq(event);
1194
1195 s = splsched();
1196 waitq_lock(waitq);
1197
1198 clock_interval_to_deadline(interval, scale_factor, &deadline);
1199
1200 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1201 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1202 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1203
1204 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1205 interruptible,
1206 TIMEOUT_URGENCY_SYS_NORMAL,
1207 deadline, TIMEOUT_NO_LEEWAY,
1208 thread);
1209
1210 waitq_unlock(waitq);
1211 splx(s);
1212 return wresult;
1213 }
1214
1215 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1216 assert_wait_timeout_with_leeway(
1217 event_t event,
1218 wait_interrupt_t interruptible,
1219 wait_timeout_urgency_t urgency,
1220 uint32_t interval,
1221 uint32_t leeway,
1222 uint32_t scale_factor)
1223 {
1224 thread_t thread = current_thread();
1225 wait_result_t wresult;
1226 uint64_t deadline;
1227 uint64_t abstime;
1228 uint64_t slop;
1229 uint64_t now;
1230 spl_t s;
1231
1232 if (__improbable(event == NO_EVENT)) {
1233 panic("%s() called with NO_EVENT", __func__);
1234 }
1235
1236 now = mach_absolute_time();
1237 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1238 deadline = now + abstime;
1239
1240 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1241
1242 struct waitq *waitq;
1243 waitq = global_eventq(event);
1244
1245 s = splsched();
1246 waitq_lock(waitq);
1247
1248 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1249 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1250 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1251
1252 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1253 interruptible,
1254 urgency, deadline, slop,
1255 thread);
1256
1257 waitq_unlock(waitq);
1258 splx(s);
1259 return wresult;
1260 }
1261
1262 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1263 assert_wait_deadline(
1264 event_t event,
1265 wait_interrupt_t interruptible,
1266 uint64_t deadline)
1267 {
1268 thread_t thread = current_thread();
1269 wait_result_t wresult;
1270 spl_t s;
1271
1272 if (__improbable(event == NO_EVENT)) {
1273 panic("%s() called with NO_EVENT", __func__);
1274 }
1275
1276 struct waitq *waitq;
1277 waitq = global_eventq(event);
1278
1279 s = splsched();
1280 waitq_lock(waitq);
1281
1282 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1283 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1284 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1285
1286 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1287 interruptible,
1288 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1289 TIMEOUT_NO_LEEWAY, thread);
1290 waitq_unlock(waitq);
1291 splx(s);
1292 return wresult;
1293 }
1294
1295 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1296 assert_wait_deadline_with_leeway(
1297 event_t event,
1298 wait_interrupt_t interruptible,
1299 wait_timeout_urgency_t urgency,
1300 uint64_t deadline,
1301 uint64_t leeway)
1302 {
1303 thread_t thread = current_thread();
1304 wait_result_t wresult;
1305 spl_t s;
1306
1307 if (__improbable(event == NO_EVENT)) {
1308 panic("%s() called with NO_EVENT", __func__);
1309 }
1310
1311 struct waitq *waitq;
1312 waitq = global_eventq(event);
1313
1314 s = splsched();
1315 waitq_lock(waitq);
1316
1317 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1318 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1319 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1320
1321 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1322 interruptible,
1323 urgency, deadline, leeway,
1324 thread);
1325 waitq_unlock(waitq);
1326 splx(s);
1327 return wresult;
1328 }
1329
1330 /*
1331 * thread_isoncpu:
1332 *
1333 * Return TRUE if a thread is running on a processor such that an AST
1334 * is needed to pull it out of userspace execution, or if executing in
1335 * the kernel, bring to a context switch boundary that would cause
1336 * thread state to be serialized in the thread PCB.
1337 *
1338 * Thread locked, returns the same way. While locked, fields
1339 * like "state" cannot change. "runq" can change only from set to unset.
1340 */
1341 static inline boolean_t
thread_isoncpu(thread_t thread)1342 thread_isoncpu(thread_t thread)
1343 {
1344 /* Not running or runnable */
1345 if (!(thread->state & TH_RUN)) {
1346 return FALSE;
1347 }
1348
1349 /* Waiting on a runqueue, not currently running */
1350 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1351 if (thread->runq != PROCESSOR_NULL) {
1352 return FALSE;
1353 }
1354
1355 /*
1356 * Thread does not have a stack yet
1357 * It could be on the stack alloc queue or preparing to be invoked
1358 */
1359 if (!thread->kernel_stack) {
1360 return FALSE;
1361 }
1362
1363 /*
1364 * Thread must be running on a processor, or
1365 * about to run, or just did run. In all these
1366 * cases, an AST to the processor is needed
1367 * to guarantee that the thread is kicked out
1368 * of userspace and the processor has
1369 * context switched (and saved register state).
1370 */
1371 return TRUE;
1372 }
1373
1374 /*
1375 * thread_stop:
1376 *
1377 * Force a preemption point for a thread and wait
1378 * for it to stop running on a CPU. If a stronger
1379 * guarantee is requested, wait until no longer
1380 * runnable. Arbitrates access among
1381 * multiple stop requests. (released by unstop)
1382 *
1383 * The thread must enter a wait state and stop via a
1384 * separate means.
1385 *
1386 * Returns FALSE if interrupted.
1387 */
boolean_t
thread_stop(
	thread_t                thread,
	boolean_t               until_not_runnable)
{
	wait_result_t   wresult;
	spl_t           s = splsched();
	boolean_t       oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Arbitrate among concurrent stop requests: wait for the current
	 * holder of TH_SUSP to release it via thread_unstop().
	 */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		/* Interrupted before we ever took TH_SUSP: just fail. */
		if (wresult != THREAD_AWAKENED) {
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	/* We now own the stop request. */
	thread->state |= TH_SUSP;

	/*
	 * Wait for the thread to leave its CPU (and, if requested, to
	 * stop being runnable at all).  An AST check is sent to its
	 * chosen processor to force a context switch boundary.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		/* Interrupted: release the stop request before failing. */
		if (wresult != THREAD_AWAKENED) {
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1467
1468 /*
1469 * thread_unstop:
1470 *
1471 * Release a previous stop request and set
1472 * the thread running if appropriate.
1473 *
1474 * Use only after a successful stop operation.
1475 */
void
thread_unstop(
	thread_t        thread)
{
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A suspended thread must also be running or waiting, never bare TH_SUSP. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		/*
		 * Wake anyone blocked in thread_stop()/thread_wait() on
		 * this thread's wake_active channel.  The thread lock is
		 * dropped first; wake_lock still serializes wake_active.
		 */
		if (thread->wake_active) {
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1506
1507 /*
1508 * thread_wait:
1509 *
1510 * Wait for a thread to stop running. (non-interruptible)
1511 *
1512 */
void
thread_wait(
	thread_t        thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	boolean_t       oncpu;
	processor_t     processor;
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			/* Kick the thread's processor to force a switch boundary. */
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		/* Ask to be woken when the thread's run state changes. */
		thread->wake_active = TRUE;
		thread_unlock(thread);

		/* THREAD_UNINT: unlike thread_stop(), this wait cannot be aborted. */
		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1561
1562 /*
1563 * Routine: clear_wait_internal
1564 *
1565 * Clear the wait condition for the specified thread.
1566 * Start the thread executing if that is appropriate.
1567 * Arguments:
1568 * thread thread to awaken
1569 * result Wakeup result the thread should see
1570 * Conditions:
1571 * At splsched
1572 * the thread is locked.
1573 * Returns:
1574 * KERN_SUCCESS thread was rousted out a wait
1575 * KERN_FAILURE thread was waiting but could not be rousted
1576 * KERN_NOT_WAITING thread was not waiting
1577 */
1578 __private_extern__ kern_return_t
clear_wait_internal(thread_t thread,wait_result_t wresult)1579 clear_wait_internal(
1580 thread_t thread,
1581 wait_result_t wresult)
1582 {
1583 waitq_t waitq = thread->waitq;
1584
1585 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1586 return KERN_FAILURE;
1587 }
1588
1589 if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
1590 return KERN_NOT_WAITING;
1591 }
1592
1593 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1594 if ((thread->state & (TH_WAIT | TH_TERMINATE)) != TH_WAIT) {
1595 return KERN_NOT_WAITING;
1596 }
1597
1598 return thread_go(thread, wresult, WQ_OPTION_NONE);
1599 }
1600
1601
1602 /*
1603 * clear_wait:
1604 *
1605 * Clear the wait condition for the specified thread. Start the thread
1606 * executing if that is appropriate.
1607 *
1608 * parameters:
1609 * thread thread to awaken
1610 * result Wakeup result the thread should see
1611 */
1612 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1613 clear_wait(
1614 thread_t thread,
1615 wait_result_t result)
1616 {
1617 kern_return_t ret;
1618 spl_t s;
1619
1620 s = splsched();
1621 thread_lock(thread);
1622 ret = clear_wait_internal(thread, result);
1623 thread_unlock(thread);
1624 splx(s);
1625 return ret;
1626 }
1627
1628
1629 /*
1630 * thread_wakeup_prim:
1631 *
1632 * Common routine for thread_wakeup, thread_wakeup_with_result,
1633 * and thread_wakeup_one.
1634 *
1635 */
1636 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1637 thread_wakeup_prim(
1638 event_t event,
1639 boolean_t one_thread,
1640 wait_result_t result)
1641 {
1642 if (__improbable(event == NO_EVENT)) {
1643 panic("%s() called with NO_EVENT", __func__);
1644 }
1645
1646 struct waitq *wq = global_eventq(event);
1647
1648 if (one_thread) {
1649 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1650 } else {
1651 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1652 }
1653 }
1654
1655 /*
1656 * Wakeup a specified thread if and only if it's waiting for this event
1657 */
1658 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1659 thread_wakeup_thread(
1660 event_t event,
1661 thread_t thread)
1662 {
1663 if (__improbable(event == NO_EVENT)) {
1664 panic("%s() called with NO_EVENT", __func__);
1665 }
1666
1667 if (__improbable(thread == THREAD_NULL)) {
1668 panic("%s() called with THREAD_NULL", __func__);
1669 }
1670
1671 struct waitq *wq = global_eventq(event);
1672
1673 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1674 }
1675
1676 /*
1677 * Wakeup a thread waiting on an event and promote it to a priority.
1678 *
1679 * Requires woken thread to un-promote itself when done.
1680 */
1681 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1682 thread_wakeup_one_with_pri(
1683 event_t event,
1684 int priority)
1685 {
1686 if (__improbable(event == NO_EVENT)) {
1687 panic("%s() called with NO_EVENT", __func__);
1688 }
1689
1690 struct waitq *wq = global_eventq(event);
1691
1692 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1693 }
1694
1695 /*
1696 * Wakeup a thread waiting on an event,
1697 * promote it to a priority,
1698 * and return a reference to the woken thread.
1699 *
1700 * Requires woken thread to un-promote itself when done.
1701 */
1702 thread_t
thread_wakeup_identify(event_t event,int priority)1703 thread_wakeup_identify(event_t event,
1704 int priority)
1705 {
1706 if (__improbable(event == NO_EVENT)) {
1707 panic("%s() called with NO_EVENT", __func__);
1708 }
1709
1710 struct waitq *wq = global_eventq(event);
1711
1712 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1713 }
1714
1715 /*
1716 * thread_bind:
1717 *
1718 * Force the current thread to execute on the specified processor.
1719 * Takes effect after the next thread_block().
1720 *
1721 * Returns the previous binding. PROCESSOR_NULL means
1722 * not bound.
1723 *
1724 * XXX - DO NOT export this to users - XXX
1725 */
1726 processor_t
thread_bind(processor_t processor)1727 thread_bind(
1728 processor_t processor)
1729 {
1730 thread_t self = current_thread();
1731 processor_t prev;
1732 spl_t s;
1733
1734 s = splsched();
1735 thread_lock(self);
1736
1737 prev = thread_bind_internal(self, processor);
1738
1739 thread_unlock(self);
1740 splx(s);
1741
1742 return prev;
1743 }
1744
1745 /*
1746 * thread_bind_internal:
1747 *
1748 * If the specified thread is not the current thread, and it is currently
1749 * running on another CPU, a remote AST must be sent to that CPU to cause
1750 * the thread to migrate to its bound processor. Otherwise, the migration
1751 * will occur at the next quantum expiration or blocking point.
1752 *
 * When the thread is the current thread, an explicit thread_block() should
1754 * be used to force the current processor to context switch away and
1755 * let the thread migrate to the bound processor.
1756 *
1757 * Thread must be locked, and at splsched.
1758 */
1759
1760 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)1761 thread_bind_internal(
1762 thread_t thread,
1763 processor_t processor)
1764 {
1765 processor_t prev;
1766
1767 /* <rdar://problem/15102234> */
1768 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1769 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1770 assert(thread->runq == PROCESSOR_NULL);
1771
1772 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1773
1774 prev = thread->bound_processor;
1775 thread->bound_processor = processor;
1776
1777 return prev;
1778 }
1779
1780 /*
1781 * thread_vm_bind_group_add:
1782 *
1783 * The "VM bind group" is a special mechanism to mark a collection
1784 * of threads from the VM subsystem that, in general, should be scheduled
1785 * with only one CPU of parallelism. To accomplish this, we initially
1786 * bind all the threads to the master processor, which has the effect
1787 * that only one of the threads in the group can execute at once, including
1788 * preempting threads in the group that are a lower priority. Future
1789 * mechanisms may use more dynamic mechanisms to prevent the collection
1790 * of VM threads from using more CPU time than desired.
1791 *
1792 * The current implementation can result in priority inversions where
1793 * compute-bound priority 95 or realtime threads that happen to have
1794 * landed on the master processor prevent the VM threads from running.
1795 * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
1797 * additional CPUs, before restoring the binding (assuming high latency
1798 * is no longer a problem).
1799 */
1800
1801 /*
1802 * The current max is provisioned for:
1803 * vm_compressor_swap_trigger_thread (92)
1804 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1805 * vm_pageout_continue (92)
1806 * memorystatus_thread (95)
1807 */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the VM bind group thread list and count below. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Threads enrolled via thread_vm_bind_group_add(); entries are referenced. */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while the group's binding is suspended by sched_vm_group_maintenance(). */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1813
void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	/* Hold a reference for the group's thread list. */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	/* Register in the group list; the max count is asserted, not checked. */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	/* Bind to the master processor to limit the group to one CPU. */
	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1832
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	/* Runnable since before the last sched tick counts as high latency. */
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: sample the run state of every thread in the group. */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread->runq == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	/* Decide whether the group's binding should flip state. */
	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new binding to each thread that can take it now. */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
1930
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* Tunable: allow realtime threads to run on SMT secondary CPUs. */
int sched_allow_rt_smt = 1;
/* Tunable: prefer to keep CPU 0 free (defaults on for x86_64 only). */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
int sched_choose_first_fd_processor = 1;
/* Tunable: allow queued realtime threads to be stolen across psets. */
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Number of extra CPUs woken as backups for RT work; see the get/set below. */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
1944
/* Return the current number of backup processors used for RT wakeups. */
int
sched_get_rt_n_backup_processors(void)
{
	return sched_rt_n_backup_processors;
}
1950
1951 void
sched_set_rt_n_backup_processors(int n)1952 sched_set_rt_n_backup_processors(int n)
1953 {
1954 if (n < 0) {
1955 n = 0;
1956 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
1957 n = SCHED_MAX_BACKUP_PROCESSORS;
1958 }
1959
1960 sched_rt_n_backup_processors = n;
1961 }
1962
/* Tunable: when true, service the RT run queue in strict priority order. */
int sched_rt_runq_strict_priority = false;
1964
1965 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)1966 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
1967 {
1968 if (current_pset != new_pset) {
1969 pset_unlock(current_pset);
1970 pset_lock(new_pset);
1971 }
1972
1973 return new_pset;
1974 }
1975
1976 /*
1977 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1978 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1979 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1980 * IPI thrash if this core does not remain idle following the load balancing ASTs
1981 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1982 * followed by a wakeup shortly thereafter.
1983 */
1984
#if (DEVELOPMENT || DEBUG)
/* Debug tunable: set to 0 to disable SMT idle rebalancing IPIs. */
int sched_smt_balance = 1;
#endif
1988
/* Invoked with pset locked, returns with pset unlocked */
void
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	/* Debug switch to disable rebalancing entirely. */
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Secondaries currently running: candidates for rebalancing onto us. */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Only rebalance non-RT work whose primary sibling is also busy. */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* Issue the IPI only after dropping the pset lock. */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
}
2041
2042 static cpumap_t
pset_available_cpumap(processor_set_t pset)2043 pset_available_cpumap(processor_set_t pset)
2044 {
2045 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]) &
2046 pset->recommended_bitmask;
2047 }
2048
2049 int
pset_available_cpu_count(processor_set_t pset)2050 pset_available_cpu_count(processor_set_t pset)
2051 {
2052 return bit_count(pset_available_cpumap(pset));
2053 }
2054
2055 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2056 pset_available_but_not_running_cpumap(processor_set_t pset)
2057 {
2058 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2059 pset->recommended_bitmask;
2060 }
2061
2062 bool
pset_has_stealable_threads(processor_set_t pset)2063 pset_has_stealable_threads(processor_set_t pset)
2064 {
2065 pset_assert_locked(pset);
2066
2067 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2068 /*
2069 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2070 * available primary CPUs
2071 */
2072 avail_map &= pset->primary_map;
2073
2074 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2075 }
2076
2077 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2078 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2079 {
2080 cpumap_t avail_map = pset_available_cpumap(pset);
2081 if (!sched_allow_rt_smt) {
2082 /*
2083 * Secondary CPUs are not allowed to run RT threads, so
2084 * only primary CPUs should be included
2085 */
2086 avail_map &= pset->primary_map;
2087 }
2088
2089 return avail_map & ~pset->realtime_map;
2090 }
2091
2092 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2093 pset_needs_a_followup_IPI(processor_set_t pset)
2094 {
2095 int nbackup_cpus = 0;
2096
2097 if (rt_runq_is_low_latency(pset)) {
2098 nbackup_cpus = sched_rt_n_backup_processors;
2099 }
2100
2101 int rt_rq_count = rt_runq_count(pset);
2102
2103 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2104 }
2105
2106 bool
pset_has_stealable_rt_threads(processor_set_t pset)2107 pset_has_stealable_rt_threads(processor_set_t pset)
2108 {
2109 pset_node_t node = pset->node;
2110 if (bit_count(node->pset_map) == 1) {
2111 return false;
2112 }
2113
2114 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2115
2116 return rt_runq_count(pset) > bit_count(avail_map);
2117 }
2118
2119 static void
pset_update_rt_stealable_state(processor_set_t pset)2120 pset_update_rt_stealable_state(processor_set_t pset)
2121 {
2122 if (pset_has_stealable_rt_threads(pset)) {
2123 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2124 } else {
2125 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2126 }
2127 }
2128
2129 static void
clear_pending_AST_bits(processor_set_t pset,processor_t processor,__kdebug_only const int trace_point_number)2130 clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2131 {
2132 /* Acknowledge any pending IPIs here with pset lock held */
2133 pset_assert_locked(pset);
2134 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2135 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2136 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2137 }
2138 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2139
2140 #if defined(CONFIG_SCHED_DEFERRED_AST)
2141 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2142 #endif
2143 }
2144
2145 /*
2146 * Called with pset locked, on a processor that is committing to run a new thread
2147 * Will transition an idle or dispatching processor to running as it picks up
2148 * the first new thread from the idle thread.
2149 */
2150 static void
pset_commit_processor_to_new_thread(processor_set_t pset,processor_t processor,thread_t new_thread)2151 pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2152 {
2153 pset_assert_locked(pset);
2154
2155 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2156 assert(current_thread() == processor->idle_thread);
2157
2158 /*
2159 * Dispatching processor is now committed to running new_thread,
2160 * so change its state to PROCESSOR_RUNNING.
2161 */
2162 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2163 } else {
2164 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2165 }
2166
2167 processor_state_update_from_thread(processor, new_thread, true);
2168
2169 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2170 bit_set(pset->realtime_map, processor->cpu_id);
2171 } else {
2172 bit_clear(pset->realtime_map, processor->cpu_id);
2173 }
2174 pset_update_rt_stealable_state(pset);
2175
2176 pset_node_t node = pset->node;
2177
2178 if (bit_count(node->pset_map) == 1) {
2179 /* Node has only a single pset, so skip node pset map updates */
2180 return;
2181 }
2182
2183 cpumap_t avail_map = pset_available_cpumap(pset);
2184
2185 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2186 if ((avail_map & pset->realtime_map) == avail_map) {
2187 /* No more non-RT CPUs in this pset */
2188 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2189 }
2190 avail_map &= pset->primary_map;
2191 if ((avail_map & pset->realtime_map) == avail_map) {
2192 /* No more non-RT primary CPUs in this pset */
2193 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2194 }
2195 } else {
2196 if ((avail_map & pset->realtime_map) != avail_map) {
2197 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2198 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2199 }
2200 }
2201 avail_map &= pset->primary_map;
2202 if ((avail_map & pset->realtime_map) != avail_map) {
2203 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2204 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2205 }
2206 }
2207 }
2208 }
2209
2210 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2211 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2212 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2213 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2214 #if defined(__x86_64__)
2215 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2216 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2217 #endif
2218 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2219 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2220
2221 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2222 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2223 {
2224 pset_map_t pset_map = stealing_pset->node->pset_map;
2225
2226 bit_clear(pset_map, stealing_pset->pset_id);
2227
2228 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2229 processor_set_t nset = pset_array[pset_id];
2230
2231 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2232 return true;
2233 }
2234 }
2235
2236 return false;
2237 }
2238
2239 /*
2240 * starting_pset must be locked, but returns true if it is unlocked before return
2241 */
2242 static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset,processor_t chosen_processor,bool spill_ipi,processor_t * result_processor,sched_ipi_type_t * result_ipi_type)2243 choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2244 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2245 {
2246 bool starting_pset_is_unlocked = false;
2247 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2248 int max_pri = rt_runq_priority(starting_pset);
2249 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2250 if (rt_constraint_ll != 0) {
2251 uint64_t ctime = mach_absolute_time();
2252 if (earliest_deadline < rt_constraint_ll + ctime) {
2253 earliest_deadline = rt_constraint_ll + ctime;
2254 }
2255 }
2256 processor_set_t pset = starting_pset;
2257 processor_t next_rt_processor = PROCESSOR_NULL;
2258 if (spill_ipi) {
2259 processor_set_t nset = next_pset(pset);
2260 assert(nset != starting_pset);
2261 pset = change_locked_pset(pset, nset);
2262 starting_pset_is_unlocked = true;
2263 }
2264 do {
2265 const bool consider_secondaries = true;
2266 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2267 if (next_rt_processor == PROCESSOR_NULL) {
2268 if (!spill_ipi) {
2269 break;
2270 }
2271 processor_set_t nset = next_pset(pset);
2272 if (nset == starting_pset) {
2273 break;
2274 }
2275 pset = change_locked_pset(pset, nset);
2276 starting_pset_is_unlocked = true;
2277 }
2278 } while (next_rt_processor == PROCESSOR_NULL);
2279 if (next_rt_processor) {
2280 if (pset != starting_pset) {
2281 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2282 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2283 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2284 }
2285 }
2286 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2287 *result_processor = next_rt_processor;
2288 }
2289 if (pset != starting_pset) {
2290 pset_unlock(pset);
2291 }
2292
2293 return starting_pset_is_unlocked;
2294 }
2295
2296 /*
2297 * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2298 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2299 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2300 */
2301 typedef enum {
2302 none,
2303 backup,
2304 followup,
2305 spill
2306 } next_processor_type_t;
2307
2308 #undef LOOP_COUNT
2309 #ifdef LOOP_COUNT
2310 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2311 #endif
2312
2313 /*
2314 * thread_select:
2315 *
2316 * Select a new thread for the current processor to execute.
2317 *
2318 * May select the current thread, which must be locked.
2319 */
2320 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2321 thread_select(thread_t thread,
2322 processor_t processor,
2323 ast_t *reason)
2324 {
2325 processor_set_t pset = processor->processor_set;
2326 thread_t new_thread = THREAD_NULL;
2327
2328 assert(processor == current_processor());
2329 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2330
2331 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2332 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2333
2334 __kdebug_only int idle_reason = 0;
2335 __kdebug_only int delay_count = 0;
2336
2337 #if defined(__x86_64__)
2338 int timeout_count = sched_backup_cpu_timeout_count;
2339 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2340 /* Prefer cpu0 as backup */
2341 timeout_count--;
2342 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2343 /* Prefer secondary cpu as backup */
2344 timeout_count--;
2345 }
2346 #endif
2347 bool pending_AST_URGENT = false;
2348 bool pending_AST_PREEMPT = false;
2349
2350 #ifdef LOOP_COUNT
2351 int loop_count = -1;
2352 #endif
2353
2354 do {
2355 /*
2356 * Update the priority.
2357 */
2358 if (SCHED(can_update_priority)(thread)) {
2359 SCHED(update_priority)(thread);
2360 }
2361
2362 pset_lock(pset);
2363
2364 restart:
2365 #ifdef LOOP_COUNT
2366 loop_count++;
2367 if (loop_count > max_loop_count[processor->cpu_id]) {
2368 max_loop_count[processor->cpu_id] = loop_count;
2369 if (bit_count(loop_count) == 1) {
2370 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2371 }
2372 }
2373 #endif
2374 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2375 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2376
2377 processor_state_update_from_thread(processor, thread, true);
2378
2379 idle_reason = 0;
2380
2381 processor_t ast_processor = PROCESSOR_NULL;
2382 processor_t next_rt_processor = PROCESSOR_NULL;
2383 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2384 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2385
2386 assert(processor->state != PROCESSOR_OFF_LINE);
2387
2388 /*
2389 * Bound threads are dispatched to a processor without going through
2390 * choose_processor(), so in those cases we must continue trying to dequeue work
2391 * as we are the only option.
2392 */
2393 if (!SCHED(processor_bound_count)(processor)) {
2394 if (!processor->is_recommended) {
2395 /*
2396 * The performance controller has provided a hint to not dispatch more threads,
2397 */
2398 idle_reason = 1;
2399 goto send_followup_ipi_before_idle;
2400 } else if (rt_runq_count(pset)) {
2401 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2402 /* Give the current RT thread a chance to complete */
2403 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2404 #if defined(__x86_64__)
2405 /*
2406 * On Intel we want to avoid SMT secondary processors and processor 0
2407 * but allow them to be used as backup processors in case the preferred chosen
2408 * processor is delayed by interrupts or processor stalls. So if it is
2409 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2410 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2411 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2412 * to grab the thread before the (current) backup processor does.
2413 *
2414 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2415 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2416 * cpu0 before secondary cpus or not.
2417 */
2418 if (!ok_to_run_realtime_thread) {
2419 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2420 if (timeout_count-- > 0) {
2421 pset_unlock(pset);
2422 thread_unlock(thread);
2423 delay(10);
2424 delay_count++;
2425 thread_lock(thread);
2426 pset_lock(pset);
2427 goto restart;
2428 }
2429 ok_to_run_realtime_thread = true;
2430 }
2431 }
2432 #endif
2433 if (!ok_to_run_realtime_thread) {
2434 idle_reason = 2;
2435 goto send_followup_ipi_before_idle;
2436 }
2437 } else if (processor->processor_primary != processor) {
2438 /*
2439 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2440 * we should look for work only under the same conditions that choose_processor()
2441 * would have assigned work, which is when all primary processors have been assigned work.
2442 */
2443 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2444 /* There are idle primaries */
2445 idle_reason = 3;
2446 goto idle;
2447 }
2448 }
2449 }
2450
2451 /*
2452 * Test to see if the current thread should continue
2453 * to run on this processor. Must not be attempting to wait, and not
2454 * bound to a different processor, nor be in the wrong
2455 * processor set, nor be forced to context switch by TH_SUSP.
2456 *
2457 * Note that there are never any RT threads in the regular runqueue.
2458 *
2459 * This code is very insanely tricky.
2460 */
2461
2462 /* i.e. not waiting, not TH_SUSP'ed */
2463 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2464
2465 /*
2466 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2467 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2468 * <rdar://problem/47907700>
2469 *
2470 * A yielding thread shouldn't be forced to context switch.
2471 */
2472
2473 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2474
2475 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2476
2477 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2478
2479 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2480
2481 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread);
2482
2483 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2484
2485 bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2486 if (current_thread_can_keep_running) {
2487 /*
2488 * This thread is eligible to keep running on this processor.
2489 *
2490 * RT threads with un-expired quantum stay on processor,
2491 * unless there's a valid RT thread with an earlier deadline
2492 * and it is still ok_to_run_realtime_thread.
2493 */
2494 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2495 /*
2496 * Allow low latency realtime threads to keep running.
2497 * Pick a new RT thread only if ok_to_run_realtime_thread
2498 * (but the current thread is allowed to complete).
2499 */
2500 if ((thread->realtime.constraint > rt_constraint_ll) && ok_to_run_realtime_thread) {
2501 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2502 goto pick_new_rt_thread;
2503 }
2504 if (rt_runq_priority(pset) > thread->sched_pri) {
2505 if (sched_rt_runq_strict_priority) {
2506 /* The next RT thread is better, so pick it off the runqueue. */
2507 goto pick_new_rt_thread;
2508 }
2509
2510 /*
2511 * See if the current lower priority thread can continue to run without causing
2512 * the higher priority thread on the runq queue to miss its deadline.
2513 */
2514 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2515 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2516 /* The next RT thread is better, so pick it off the runqueue. */
2517 goto pick_new_rt_thread;
2518 }
2519 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2520 /* The next RT thread is better, so pick it off the runqueue. */
2521 goto pick_new_rt_thread;
2522 }
2523 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2524 goto pick_new_rt_thread;
2525 }
2526 }
2527
2528 /* This is still the best RT thread to run. */
2529 processor->deadline = thread->realtime.deadline;
2530
2531 sched_update_pset_load_average(pset, 0);
2532
2533 clear_pending_AST_bits(pset, processor, 1);
2534
2535 next_rt_processor = PROCESSOR_NULL;
2536 next_rt_ipi_type = SCHED_IPI_NONE;
2537
2538 bool pset_unlocked = false;
2539 __kdebug_only next_processor_type_t nptype = none;
2540 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2541 nptype = spill;
2542 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2543 } else if (pset_needs_a_followup_IPI(pset)) {
2544 nptype = followup;
2545 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2546 }
2547 if (!pset_unlocked) {
2548 pset_unlock(pset);
2549 }
2550
2551 if (next_rt_processor) {
2552 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2553 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2554 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2555 }
2556
2557 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2558 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2559 return thread;
2560 }
2561
2562 if ((rt_runq_count(pset) == 0) &&
2563 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2564 /* This thread is still the highest priority runnable (non-idle) thread */
2565 processor->deadline = RT_DEADLINE_NONE;
2566
2567 sched_update_pset_load_average(pset, 0);
2568
2569 clear_pending_AST_bits(pset, processor, 2);
2570
2571 pset_unlock(pset);
2572
2573 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2574 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2575 return thread;
2576 }
2577 } else {
2578 /*
2579 * This processor must context switch.
2580 * If it's due to a rebalance, we should aggressively find this thread a new home.
2581 */
2582 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2583 *reason |= AST_REBALANCE;
2584 }
2585 }
2586
2587 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2588 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2589 (processor->processor_secondary->state == PROCESSOR_IDLE));
2590
2591 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2592 if (ok_to_run_realtime_thread) {
2593 pick_new_rt_thread:
2594 new_thread = sched_rt_choose_thread(pset);
2595 if (new_thread != THREAD_NULL) {
2596 processor->deadline = new_thread->realtime.deadline;
2597 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2598
2599 clear_pending_AST_bits(pset, processor, 3);
2600
2601 if (processor->processor_secondary != NULL) {
2602 processor_t sprocessor = processor->processor_secondary;
2603 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2604 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2605 ast_processor = sprocessor;
2606 }
2607 }
2608 }
2609 }
2610
2611 send_followup_ipi_before_idle:
2612 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2613 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2614 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2615 }
2616 __kdebug_only next_processor_type_t nptype = none;
2617 bool pset_unlocked = false;
2618 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2619 nptype = spill;
2620 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2621 } else if (pset_needs_a_followup_IPI(pset)) {
2622 nptype = followup;
2623 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2624 }
2625
2626 assert(new_thread || !ast_processor);
2627 if (new_thread || next_rt_processor) {
2628 if (!pset_unlocked) {
2629 pset_unlock(pset);
2630 pset_unlocked = true;
2631 }
2632 if (ast_processor == next_rt_processor) {
2633 ast_processor = PROCESSOR_NULL;
2634 ipi_type = SCHED_IPI_NONE;
2635 }
2636
2637 if (ast_processor) {
2638 sched_ipi_perform(ast_processor, ipi_type);
2639 }
2640
2641 if (next_rt_processor) {
2642 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2643 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2644 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2645 }
2646
2647 if (new_thread) {
2648 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2649 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2650 return new_thread;
2651 }
2652 }
2653
2654 if (pset_unlocked) {
2655 pset_lock(pset);
2656 }
2657
2658 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2659 /* Things changed while we dropped the lock */
2660 goto restart;
2661 }
2662
2663 if (processor->is_recommended) {
2664 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2665 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2666 /* Things changed while we dropped the lock */
2667 goto restart;
2668 }
2669
2670 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2671 /* secondary can only run realtime thread */
2672 if (idle_reason == 0) {
2673 idle_reason = 4;
2674 }
2675 goto idle;
2676 }
2677 } else if (!SCHED(processor_bound_count)(processor)) {
2678 /* processor not recommended and no bound threads */
2679 if (idle_reason == 0) {
2680 idle_reason = 5;
2681 }
2682 goto idle;
2683 }
2684
2685 processor->deadline = RT_DEADLINE_NONE;
2686
2687 /* No RT threads, so let's look at the regular threads. */
2688 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2689 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2690
2691 clear_pending_AST_bits(pset, processor, 4);
2692
2693 ast_processor = PROCESSOR_NULL;
2694 ipi_type = SCHED_IPI_NONE;
2695
2696 processor_t sprocessor = processor->processor_secondary;
2697 if (sprocessor != NULL) {
2698 if (sprocessor->state == PROCESSOR_RUNNING) {
2699 if (thread_no_smt(new_thread)) {
2700 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2701 ast_processor = sprocessor;
2702 }
2703 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2704 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2705 ast_processor = sprocessor;
2706 }
2707 }
2708 pset_unlock(pset);
2709
2710 if (ast_processor) {
2711 sched_ipi_perform(ast_processor, ipi_type);
2712 }
2713 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2714 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2715 return new_thread;
2716 }
2717
2718 if (processor->must_idle) {
2719 processor->must_idle = false;
2720 *reason |= AST_REBALANCE;
2721 idle_reason = 6;
2722 goto idle;
2723 }
2724
2725 if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2726 /*
2727 * No runnable threads, attempt to steal
2728 * from other processors. Returns with pset lock dropped.
2729 */
2730
2731 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2732 pset_lock(pset);
2733 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2734 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2735 /*
2736 * A realtime thread choose this processor while it was DISPATCHING
2737 * and the pset lock was dropped
2738 */
2739 ast_on(AST_URGENT | AST_PREEMPT);
2740 }
2741
2742 clear_pending_AST_bits(pset, processor, 5);
2743
2744 pset_unlock(pset);
2745
2746 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2747 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2748 return new_thread;
2749 }
2750
2751 /*
2752 * If other threads have appeared, shortcut
2753 * around again.
2754 */
2755 if (SCHED(processor_bound_count)(processor)) {
2756 continue;
2757 }
2758 if (processor->is_recommended) {
2759 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2760 continue;
2761 }
2762 }
2763
2764 pset_lock(pset);
2765 }
2766
2767 idle:
2768 /* Someone selected this processor while we had dropped the lock */
2769 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2770 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2771 goto restart;
2772 }
2773
2774 if ((idle_reason == 0) && current_thread_can_keep_running) {
2775 /* This thread is the only runnable (non-idle) thread */
2776 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2777 processor->deadline = thread->realtime.deadline;
2778 } else {
2779 processor->deadline = RT_DEADLINE_NONE;
2780 }
2781
2782 sched_update_pset_load_average(pset, 0);
2783
2784 clear_pending_AST_bits(pset, processor, 6);
2785
2786 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2787 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2788 pset_unlock(pset);
2789 return thread;
2790 }
2791
2792 /*
2793 * Nothing is runnable, or this processor must be forced idle,
2794 * so set this processor idle if it was running.
2795 */
2796 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2797 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2798 processor_state_update_idle(processor);
2799 }
2800 pset_update_rt_stealable_state(pset);
2801
2802 clear_pending_AST_bits(pset, processor, 7);
2803
2804 /* Invoked with pset locked, returns with pset unlocked */
2805 SCHED(processor_balance)(processor, pset);
2806
2807 new_thread = processor->idle_thread;
2808 } while (new_thread == THREAD_NULL);
2809
2810 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2811 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2812 return new_thread;
2813 }
2814
2815 /*
2816 * thread_invoke
2817 *
2818 * Called at splsched with neither thread locked.
2819 *
2820 * Perform a context switch and start executing the new thread.
2821 *
2822 * Returns FALSE when the context switch didn't happen.
2823 * The reference to the new thread is still consumed.
2824 *
2825 * "self" is what is currently running on the processor,
2826 * "thread" is the new thread to context switch to
2827 * (which may be the same thread in some cases)
2828 */
static boolean_t
thread_invoke(
	thread_t                        self,
	thread_t                        thread,
	ast_t                           reason)
{
	/* Context switching with preemption disabled (or from interrupt context) is fatal. */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/* Capture any continuation 'self' is parking in before we switch away. */
	thread_continue_t continuation = self->continuation;
	void *parameter = self->parameter;

	uint64_t ctime = mach_absolute_time();

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip maintenance when switching to the idle thread, or when a
	 * realtime thread is doing a direct handoff (latency-sensitive path).
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime);
	}
#endif

#if MONOTONIC
	mt_sched_update(self);
#endif /* MONOTONIC */

	assert_thread_magic(self);
	assert(self == current_thread());
	assert(self->runq == PROCESSOR_NULL);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	/* The new thread stays locked until we commit to (or abandon) the switch. */
	thread_lock(thread);

	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	assert(thread->runq == PROCESSOR_NULL);

	/* Reload precise timing global policy to thread-local policy */
	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if INTERRUPT_MASKED_DEBUG
	ml_spin_debug_clear(thread);
#endif

	/* 'self' is parking in a continuation, so it may be able to hand off its stack. */
	if (continuation != NULL) {
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for the incoming thread. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			processor_timer_switch_thread(ctime, &thread->system_timer);
			timer_update(&thread->runnable_timer, ctime);
			processor->kernel_timer = &thread->system_timer;

			/*
			 * Since non-precise user/kernel time doesn't update the state timer
			 * during privilege transitions, synthesize an event now.
			 */
			if (!thread->precise_user_kernel_time) {
				timer_update(processor->current_state, ctime);
			}

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required. However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			/*
			 * No stack available: fail the switch and queue the thread
			 * for a stack; the caller retains responsibility for retrying.
			 */
			if (!stack_alloc_try(thread)) {
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with no continuation: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	processor_timer_switch_thread(ctime, &thread->system_timer);
	timer_update(&thread->runnable_timer, ctime);
	processor->kernel_timer = &thread->system_timer;

	/*
	 * Since non-precise user/kernel time doesn't update the state timer
	 * during privilege transitions, synthesize an event now.
	 */
	if (!thread->precise_user_kernel_time) {
		timer_update(processor->current_state, ctime);
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required. We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */
	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3166
3167 #if defined(CONFIG_SCHED_DEFERRED_AST)
3168 /*
3169 * pset_cancel_deferred_dispatch:
3170 *
3171 * Cancels all ASTs that we can cancel for the given processor set
3172 * if the current processor is running the last runnable thread in the
3173 * system.
3174 *
3175 * This function assumes the current thread is runnable. This must
3176 * be called with the pset unlocked.
3177 */
static void
pset_cancel_deferred_dispatch(
	processor_set_t         pset,
	processor_t             processor)
{
	processor_t             active_processor = NULL;
	uint32_t                sampled_sched_run_count;

	pset_lock(pset);
	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * If we have emptied the run queue, and our current thread is runnable, we
	 * should tell any processors that are still DISPATCHING that they will
	 * probably not have any work to do. In the event that there are no
	 * pending signals that we can cancel, this is also uninteresting.
	 *
	 * In the unlikely event that another thread becomes runnable while we are
	 * doing this (sched_run_count is atomically updated, not guarded), the
	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
	 * in order to dispatch it to a processor in our pset. So, the other
	 * codepath will wait while we squash all cancelable ASTs, get the pset
	 * lock, and then dispatch the freshly runnable thread. So this should be
	 * correct (we won't accidentally have a runnable thread that hasn't been
	 * dispatched to an idle processor), if not ideal (we may be restarting the
	 * dispatch process, which could have some overhead).
	 */

	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * Consider only CPUs that are DISPATCHING with a deferred (cancelable)
		 * AST pending and no urgent (non-cancelable) AST pending.
		 */
		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);
		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
			active_processor = processor_array[cpuid];
			/*
			 * If a processor is DISPATCHING, it could be because of
			 * a cancelable signal.
			 *
			 * IF the processor is not our
			 * current processor (the current processor should not
			 * be DISPATCHING, so this is a bit paranoid), AND there
			 * is a cancelable signal pending on the processor, AND
			 * there is no non-cancelable signal pending (as there is
			 * no point trying to backtrack on bringing the processor
			 * up if a signal we cannot cancel is outstanding), THEN
			 * it should make sense to roll back the processor state
			 * to the IDLE state.
			 *
			 * If the racey nature of this approach (as the signal
			 * will be arbitrated by hardware, and can fire as we
			 * roll back state) results in the core responding
			 * despite being pushed back to the IDLE state, it
			 * should be no different than if the core took some
			 * interrupt while IDLE.
			 */
			if (active_processor != processor) {
				/*
				 * Squash all of the processor state back to some
				 * reasonable facsimile of PROCESSOR_IDLE.
				 */

				processor_state_update_idle(active_processor);
				active_processor->deadline = RT_DEADLINE_NONE;
				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
				machine_signal_idle_cancel(active_processor);
			}
		}
	}

	pset_unlock(pset);
}
3250 #else
3251 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3252 #endif
3253
3254 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3255 thread_csw_callout(
3256 thread_t old,
3257 thread_t new,
3258 uint64_t timestamp)
3259 {
3260 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3261 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3262 machine_switch_perfcontrol_context(event, timestamp, 0,
3263 same_pri_latency, old, new);
3264 }
3265
3266
3267 /*
3268 * thread_dispatch:
3269 *
3270 * Handle threads at context switch. Re-dispatch other thread
3271 * if still running, otherwise update run state and perform
3272 * special actions. Update quantum for other thread and begin
3273 * the quantum for ourselves.
3274 *
3275 * "thread" is the old thread that we have switched away from.
3276 * "self" is the new current thread that we have context switched to
3277 *
3278 * Called at splsched.
3279 *
3280 */
void
thread_dispatch(
	thread_t                thread,
	thread_t                self)
{
	processor_t processor = self->last_processor;
	/* true when we switched away from the idle thread (drives kperf timer setup) */
	bool was_idle = false;

	assert(processor == current_processor());
	assert(self == current_thread_volatile());
	assert(thread != self);

	if (thread != THREAD_NULL) {
		/*
		 * Do the perfcontrol callout for context switch.
		 * The reason we do this here is:
		 * - thread_dispatch() is called from various places that are not
		 *   the direct context switch path for eg. processor shutdown etc.
		 *   So adding the callout here covers all those cases.
		 * - We want this callout as early as possible to be close
		 *   to the timestamp taken in thread_invoke()
		 * - We want to avoid holding the thread lock while doing the
		 *   callout
		 * - We do not want to callout if "thread" is NULL.
		 */
		thread_csw_callout(thread, self, processor->last_dispatch);

#if KASAN
		if (thread->continuation != NULL) {
			/*
			 * Thread has a continuation and the normal stack is going away.
			 * Unpoison the stack and mark all fakestack objects as unused.
			 */
			kasan_fakestack_drop(thread);
			if (thread->kernel_stack) {
				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
			}
		}

		/*
		 * Free all unused fakestack objects.
		 */
		kasan_fakestack_gc(thread);
#endif

		/*
		 * If blocked at a continuation, discard
		 * the stack.
		 */
		if (thread->continuation != NULL && thread->kernel_stack != 0) {
			stack_free(thread);
		}

		if (thread->state & TH_IDLE) {
			was_idle = true;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
			    (uintptr_t)thread_tid(thread), 0, thread->state,
			    sched_run_buckets[TH_BUCKET_RUN], 0);
		} else {
			int64_t consumed;
			int64_t remainder = 0;

			/* remainder = unused portion of the old thread's quantum */
			if (processor->quantum_end > processor->last_dispatch) {
				remainder = processor->quantum_end -
				    processor->last_dispatch;
			}

			consumed = thread->quantum_remaining - remainder;

			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the task and
				 * the individual thread.
				 */
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit_thread(thread, thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
				if (thread->t_bankledger) {
					ledger_credit_thread(thread, thread->t_bankledger,
					    bank_ledgers.cpu_time,
					    (consumed - thread->t_deduct_bank_ledger_time));
				}
				thread->t_deduct_bank_ledger_time = 0;
				if (consumed > 0) {
					/*
					 * This should never be negative, but in traces we are seeing some instances
					 * of consumed being negative.
					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
					 */
					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
				}
			}

			/* For the thread that we just context switched away from, figure
			 * out if we have expired the wq quantum and set the AST if we have
			 */
			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
				thread_evaluate_workqueue_quantum_expiry(thread);
			}

			wake_lock(thread);
			thread_lock(thread);

			/*
			 * Apply a priority floor if the thread holds a kernel resource
			 * or explicitly requested it.
			 * Do this before checking starting_pri to avoid overpenalizing
			 * repeated rwlock blockers.
			 */
			if (__improbable(thread->rwlock_count != 0)) {
				lck_rw_set_promotion_locked(thread);
			}
			if (__improbable(thread->priority_floor_count != 0)) {
				thread_floor_boost_set_promotion_locked(thread);
			}

			boolean_t keep_quantum = processor->first_timeslice;

			/*
			 * Treat a thread which has dropped priority since it got on core
			 * as having expired its quantum.
			 */
			if (processor->starting_pri > thread->sched_pri) {
				keep_quantum = FALSE;
			}

			/* Compute remainder of current quantum. */
			if (keep_quantum &&
			    processor->quantum_end > processor->last_dispatch) {
				thread->quantum_remaining = (uint32_t)remainder;
			} else {
				thread->quantum_remaining = 0;
			}

			if (thread->sched_mode == TH_MODE_REALTIME) {
				/*
				 * Cancel the deadline if the thread has
				 * consumed the entire quantum.
				 */
				if (thread->quantum_remaining == 0) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
				}
			} else {
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
				/*
				 * For non-realtime threads treat a tiny
				 * remaining quantum as an expired quantum
				 * but include what's left next time.
				 */
				if (thread->quantum_remaining < min_std_quantum) {
					thread->reason |= AST_QUANTUM;
					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
				}
#endif /* CONFIG_SCHED_TIMESHARE_CORE */
			}

			/*
			 * If we are doing a direct handoff then
			 * take the remainder of the quantum.
			 */
			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
				self->quantum_remaining = thread->quantum_remaining;
				thread->reason |= AST_QUANTUM;
				thread->quantum_remaining = 0;
			} else {
#if defined(CONFIG_SCHED_MULTIQ)
				if (SCHED(sched_groups_enabled) &&
				    thread->sched_group == self->sched_group) {
					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
					    self->reason, (uintptr_t)thread_tid(thread),
					    self->quantum_remaining, thread->quantum_remaining, 0);

					self->quantum_remaining = thread->quantum_remaining;
					thread->quantum_remaining = 0;
					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
				}
#endif /* defined(CONFIG_SCHED_MULTIQ) */
			}

			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);

			if (!(thread->state & TH_WAIT)) {
				/*
				 * Still runnable.
				 */
				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;

				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);

				ast_t reason = thread->reason;
				sched_options_t options = SCHED_NONE;

				if (reason & AST_REBALANCE) {
					options |= SCHED_REBALANCE;
					if (reason & AST_QUANTUM) {
						/*
						 * Having gone to the trouble of forcing this thread off a less preferred core,
						 * we should force the preferable core to reschedule immediately to give this
						 * thread a chance to run instead of just sitting on the run queue where
						 * it may just be stolen back by the idle core we just forced it off.
						 * But only do this at the end of a quantum to prevent cascading effects.
						 */
						options |= SCHED_PREEMPT;
					}
				}

				if (reason & AST_QUANTUM) {
					options |= SCHED_TAILQ;
				} else if (reason & AST_PREEMPT) {
					options |= SCHED_HEADQ;
				} else {
					options |= (SCHED_PREEMPT | SCHED_TAILQ);
				}

				/* Put the still-runnable thread back on a run queue. */
				thread_setrun(thread, options);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
				    sched_run_buckets[TH_BUCKET_RUN], 0);

				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);
			} else {
				/*
				 * Waiting.
				 */
				boolean_t should_terminate = FALSE;
				uint32_t new_run_count;
				int thread_state = thread->state;

				/* Only the first call to thread_dispatch
				 * after explicit termination should add
				 * the thread to the termination queue
				 */
				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
					should_terminate = TRUE;
					thread_state |= TH_TERMINATE2;
				}

				timer_stop(&thread->runnable_timer, processor->last_dispatch);

				thread_state &= ~TH_RUN;
				thread->state = thread_state;

				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
				thread->chosen_processor = PROCESSOR_NULL;

				new_run_count = SCHED(run_count_decr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
					work_interval_auto_join_unwind(thread);
				}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_SFI
				if (thread->reason & AST_SFI) {
					thread->wait_sfi_begin_time = processor->last_dispatch;
				}
#endif
				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);

				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
				    new_run_count, 0);

				if (thread_state & TH_WAIT_REPORT) {
					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
				}

				if (thread->wake_active) {
					thread->wake_active = FALSE;
					thread_unlock(thread);

					thread_wakeup(&thread->wake_active);
				} else {
					thread_unlock(thread);
				}

				wake_unlock(thread);

				if (should_terminate) {
					thread_terminate_enqueue(thread);
				}
			}
		}
		/*
		 * The thread could have been added to the termination queue, so it's
		 * unsafe to use after this point.
		 */
		thread = THREAD_NULL;
	}

	int urgency = THREAD_URGENCY_NONE;
	uint64_t latency = 0;

	/* Update (new) current thread and reprogram running timers */
	thread_lock(self);

	if (!(self->state & TH_IDLE)) {
		uint64_t arg1, arg2;

#if CONFIG_SCHED_SFI
		ast_t new_ast;

		new_ast = sfi_thread_needs_ast(self, NULL);

		if (new_ast != AST_NONE) {
			ast_on(new_ast);
		}
#endif

		if (processor->last_dispatch < self->last_made_runnable_time) {
			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
			    processor->last_dispatch, self->last_made_runnable_time);
		}

		assert(self->last_made_runnable_time <= self->last_basepri_change_time);

		latency = processor->last_dispatch - self->last_made_runnable_time;
		assert(latency >= self->same_pri_latency);

		urgency = thread_get_urgency(self, &arg1, &arg2);

		thread_tell_urgency(urgency, arg1, arg2, latency, self);

		/*
		 * Get a new quantum if none remaining.
		 */
		if (self->quantum_remaining == 0) {
			thread_quantum_init(self);
		}

		/*
		 * Set up quantum timer and timeslice.
		 */
		processor->quantum_end = processor->last_dispatch +
		    self->quantum_remaining;

		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
		    processor->quantum_end, processor->last_dispatch);
		if (was_idle) {
			/*
			 * kperf's running timer is active whenever the idle thread for a
			 * CPU is not running.
			 */
			kperf_running_setup(processor, processor->last_dispatch);
		}
		running_timers_activate(processor);
		processor->first_timeslice = TRUE;
	} else {
		/* Going idle: no quantum timer, no urgency. */
		running_timers_deactivate(processor);
		processor->first_timeslice = FALSE;
		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	}

	assert(self->block_hint == kThreadWaitNone);
	self->computation_epoch = processor->last_dispatch;
	self->reason = AST_NONE;
	processor->starting_pri = self->sched_pri;

	thread_unlock(self);

	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	    processor->last_dispatch);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	/*
	 * TODO: Can we state that redispatching our old thread is also
	 * uninteresting?
	 */
	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
		pset_cancel_deferred_dispatch(processor->processor_set, processor);
	}
#endif
}
3672
3673 /*
3674 * thread_block_reason:
3675 *
3676 * Forces a reschedule, blocking the caller if a wait
3677 * has been asserted.
3678 *
3679 * If a continuation is specified, then thread_invoke will
3680 * attempt to discard the thread's kernel stack. When the
3681 * thread resumes, it will execute the continuation function
3682 * on a new kernel stack.
3683 */
3684 wait_result_t
thread_block_reason(thread_continue_t continuation,void * parameter,ast_t reason)3685 thread_block_reason(
3686 thread_continue_t continuation,
3687 void *parameter,
3688 ast_t reason)
3689 {
3690 thread_t self = current_thread();
3691 processor_t processor;
3692 thread_t new_thread;
3693 spl_t s;
3694
3695 s = splsched();
3696
3697 processor = current_processor();
3698
3699 /* If we're explicitly yielding, force a subsequent quantum */
3700 if (reason & AST_YIELD) {
3701 processor->first_timeslice = FALSE;
3702 }
3703
3704 /* We're handling all scheduling AST's */
3705 ast_off(AST_SCHEDULING);
3706
3707 #if PROC_REF_DEBUG
3708 if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3709 uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3710 }
3711 #endif
3712
3713 self->continuation = continuation;
3714 self->parameter = parameter;
3715
3716 if (self->state & ~(TH_RUN | TH_IDLE)) {
3717 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3718 MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3719 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3720 }
3721
3722 do {
3723 thread_lock(self);
3724 new_thread = thread_select(self, processor, &reason);
3725 thread_unlock(self);
3726 } while (!thread_invoke(self, new_thread, reason));
3727
3728 splx(s);
3729
3730 return self->wait_result;
3731 }
3732
3733 /*
3734 * thread_block:
3735 *
3736 * Block the current thread if a wait has been asserted.
3737 */
3738 wait_result_t
thread_block(thread_continue_t continuation)3739 thread_block(
3740 thread_continue_t continuation)
3741 {
3742 return thread_block_reason(continuation, NULL, AST_NONE);
3743 }
3744
3745 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3746 thread_block_parameter(
3747 thread_continue_t continuation,
3748 void *parameter)
3749 {
3750 return thread_block_reason(continuation, parameter, AST_NONE);
3751 }
3752
3753 /*
3754 * thread_run:
3755 *
3756 * Switch directly from the current thread to the
3757 * new thread, handing off our quantum if appropriate.
3758 *
3759 * New thread must be runnable, and not on a run queue.
3760 *
3761 * Called at splsched.
3762 */
3763 int
thread_run(thread_t self,thread_continue_t continuation,void * parameter,thread_t new_thread)3764 thread_run(
3765 thread_t self,
3766 thread_continue_t continuation,
3767 void *parameter,
3768 thread_t new_thread)
3769 {
3770 ast_t reason = AST_NONE;
3771
3772 if ((self->state & TH_IDLE) == 0) {
3773 reason = AST_HANDOFF;
3774 }
3775
3776 /*
3777 * If this thread hadn't been setrun'ed, it
3778 * might not have a chosen processor, so give it one
3779 */
3780 if (new_thread->chosen_processor == NULL) {
3781 new_thread->chosen_processor = current_processor();
3782 }
3783
3784 self->continuation = continuation;
3785 self->parameter = parameter;
3786
3787 while (!thread_invoke(self, new_thread, reason)) {
3788 /* the handoff failed, so we have to fall back to the normal block path */
3789 processor_t processor = current_processor();
3790
3791 reason = AST_NONE;
3792
3793 thread_lock(self);
3794 new_thread = thread_select(self, processor, &reason);
3795 thread_unlock(self);
3796 }
3797
3798 return self->wait_result;
3799 }
3800
3801 /*
3802 * thread_continue:
3803 *
3804 * Called at splsched when a thread first receives
3805 * a new stack after a continuation.
3806 *
3807 * Called with THREAD_NULL as the old thread when
3808 * invoked by machine_load_context.
3809 */
void
thread_continue(
	thread_t thread)
{
	thread_t self = current_thread();
	thread_continue_t continuation;
	void *parameter;

	DTRACE_SCHED(on__cpu);

	/* Snapshot the continuation/parameter before they are cleared below. */
	continuation = self->continuation;
	parameter = self->parameter;

	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif

	/* Finish switching away from the previous thread (THREAD_NULL when
	 * invoked by machine_load_context at boot — see header comment). */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if INTERRUPT_MASKED_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

	/* Jump into the continuation on the fresh stack; never returns. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
3850
3851 void
thread_quantum_init(thread_t thread)3852 thread_quantum_init(thread_t thread)
3853 {
3854 if (thread->sched_mode == TH_MODE_REALTIME) {
3855 thread->quantum_remaining = thread->realtime.computation;
3856 } else {
3857 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3858 }
3859 }
3860
3861 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)3862 sched_timeshare_initial_quantum_size(thread_t thread)
3863 {
3864 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
3865 return bg_quantum;
3866 } else {
3867 return std_quantum;
3868 }
3869 }
3870
3871 /*
3872 * run_queue_init:
3873 *
3874 * Initialize a run queue before first use.
3875 */
3876 void
run_queue_init(run_queue_t rq)3877 run_queue_init(
3878 run_queue_t rq)
3879 {
3880 rq->highq = NOPRI;
3881 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
3882 rq->bitmap[i] = 0;
3883 }
3884 rq->urgency = rq->count = 0;
3885 for (int i = 0; i < NRQS; i++) {
3886 circle_queue_init(&rq->queues[i]);
3887 }
3888 }
3889
3890 /*
3891 * run_queue_dequeue:
3892 *
3893 * Perform a dequeue operation on a run queue,
3894 * and return the resulting thread.
3895 *
3896 * The run queue must be locked (see thread_run_queue_remove()
3897 * for more info), and not empty.
3898 */
3899 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)3900 run_queue_dequeue(
3901 run_queue_t rq,
3902 sched_options_t options)
3903 {
3904 thread_t thread;
3905 circle_queue_t queue = &rq->queues[rq->highq];
3906
3907 if (options & SCHED_HEADQ) {
3908 thread = cqe_dequeue_head(queue, struct thread, runq_links);
3909 } else {
3910 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
3911 }
3912
3913 assert(thread != THREAD_NULL);
3914 assert_thread_magic(thread);
3915
3916 thread->runq = PROCESSOR_NULL;
3917 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3918 rq->count--;
3919 if (SCHED(priority_is_urgent)(rq->highq)) {
3920 rq->urgency--; assert(rq->urgency >= 0);
3921 }
3922 if (circle_queue_empty(queue)) {
3923 bitmap_clear(rq->bitmap, rq->highq);
3924 rq->highq = bitmap_first(rq->bitmap, NRQS);
3925 }
3926
3927 return thread;
3928 }
3929
3930 /*
3931 * run_queue_enqueue:
3932 *
 * Perform an enqueue operation on a run queue.
3934 *
3935 * The run queue must be locked (see thread_run_queue_remove()
3936 * for more info).
3937 */
3938 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)3939 run_queue_enqueue(
3940 run_queue_t rq,
3941 thread_t thread,
3942 sched_options_t options)
3943 {
3944 circle_queue_t queue = &rq->queues[thread->sched_pri];
3945 boolean_t result = FALSE;
3946
3947 assert_thread_magic(thread);
3948
3949 if (circle_queue_empty(queue)) {
3950 circle_enqueue_tail(queue, &thread->runq_links);
3951
3952 rq_bitmap_set(rq->bitmap, thread->sched_pri);
3953 if (thread->sched_pri > rq->highq) {
3954 rq->highq = thread->sched_pri;
3955 result = TRUE;
3956 }
3957 } else {
3958 if (options & SCHED_TAILQ) {
3959 circle_enqueue_tail(queue, &thread->runq_links);
3960 } else {
3961 circle_enqueue_head(queue, &thread->runq_links);
3962 }
3963 }
3964 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3965 rq->urgency++;
3966 }
3967 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3968 rq->count++;
3969
3970 return result;
3971 }
3972
3973 /*
3974 * run_queue_remove:
3975 *
3976 * Remove a specific thread from a runqueue.
3977 *
3978 * The run queue must be locked.
3979 */
3980 void
run_queue_remove(run_queue_t rq,thread_t thread)3981 run_queue_remove(
3982 run_queue_t rq,
3983 thread_t thread)
3984 {
3985 circle_queue_t queue = &rq->queues[thread->sched_pri];
3986
3987 assert(thread->runq != PROCESSOR_NULL);
3988 assert_thread_magic(thread);
3989
3990 circle_dequeue(queue, &thread->runq_links);
3991 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3992 rq->count--;
3993 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3994 rq->urgency--; assert(rq->urgency >= 0);
3995 }
3996
3997 if (circle_queue_empty(queue)) {
3998 /* update run queue status */
3999 bitmap_clear(rq->bitmap, thread->sched_pri);
4000 rq->highq = bitmap_first(rq->bitmap, NRQS);
4001 }
4002
4003 thread->runq = PROCESSOR_NULL;
4004 }
4005
4006 /*
4007 * run_queue_peek
4008 *
4009 * Peek at the runq and return the highest
4010 * priority thread from the runq.
4011 *
4012 * The run queue must be locked.
4013 */
4014 thread_t
run_queue_peek(run_queue_t rq)4015 run_queue_peek(
4016 run_queue_t rq)
4017 {
4018 if (rq->count > 0) {
4019 circle_queue_t queue = &rq->queues[rq->highq];
4020 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4021 assert_thread_magic(thread);
4022 return thread;
4023 } else {
4024 return THREAD_NULL;
4025 }
4026 }
4027
/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into rt_run_queue, keeping the per-priority
 * queue sorted by increasing deadline, and refresh the cached
 * per-priority and queue-wide earliest deadline/constraint.
 *
 * Returns true if the thread became the head of its priority queue
 * (i.e. the caller should consider preempting).
 *
 * Caller holds the pset lock (see realtime_queue_insert()).
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Index of this priority within rt_queue_pri[] and the bitmap. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	/* Mark this priority level occupied. */
	bitmap_set(map, i);

	queue_t queue = &rt_runq->pri_queue;
	uint64_t deadline = thread->realtime.deadline;
	bool preempt = false;
	bool earliest = false;   /* did this thread become earliest at its priority? */

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				/* New head: update this priority's cached earliest deadline. */
				if (iter == queue_first(queue)) {
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				/* Latest deadline of all: append at the tail. */
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Refresh the queue-wide earliest deadline if this thread beats it. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	/* Thread is now accounted to this processor's run queue. */
	thread->runq = processor;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4088
/*
 * rt_runq_dequeue:
 *
 * Remove and return the next realtime thread to run, then recompute
 * the cached queue-wide earliest deadline/constraint/ed_index.
 *
 * Normally takes the head of the highest-priority non-empty queue.
 * When sched_rt_runq_strict_priority is off, the earliest-deadline
 * thread from a lower priority is preferred instead, provided both
 * computations (plus rt_deadline_epsilon) still fit within the
 * higher-priority thread's constraint.
 *
 * NOTE(review): non-atomic fields are mutated directly, so the caller
 * presumably holds the pset/runq lock — confirm against callers.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);   /* highest occupied priority index */
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			/*
			 * Run the earliest-deadline thread first if doing so still
			 * leaves room for the higher-priority thread's computation
			 * inside its constraint (with epsilon slack).
			 */
			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* New head defines this priority's earliest deadline/constraint. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Priority level is now empty. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline over all occupied priorities. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	/* Thread is no longer on any run queue. */
	new_thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4152
4153 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4154 rt_runq_first(rt_queue_t rt_run_queue)
4155 {
4156 bitmap_t *map = rt_run_queue->bitmap;
4157 int i = bitmap_first(map, NRTQS);
4158 if (i < 0) {
4159 return THREAD_NULL;
4160 }
4161 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4162 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4163
4164 return next_rt;
4165 }
4166
/*
 * rt_runq_remove:
 *
 * Remove a specific thread from the realtime run queue and recompute
 * the cached per-priority and queue-wide earliest
 * deadline/constraint/ed_index (mirrors rt_runq_dequeue()'s bookkeeping).
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	/* Index of this priority within rt_queue_pri[] and the bitmap. */
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* New head defines this priority's earliest deadline/constraint. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* Priority level is now empty. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Recompute the queue-wide earliest deadline over all occupied priorities. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	/* Thread is no longer on any run queue. */
	thread->runq = PROCESSOR_NULL;

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4212
4213 rt_queue_t
sched_rtlocal_runq(processor_set_t pset)4214 sched_rtlocal_runq(processor_set_t pset)
4215 {
4216 return &pset->rt_runq;
4217 }
4218
/* Initialize a pset's local realtime run queue state. */
void
sched_rtlocal_init(processor_set_t pset)
{
	pset_rt_init(pset);
}
4224
/*
 * sched_rtlocal_queue_shutdown:
 *
 * Drain the realtime run queue of a processor's pset when no available
 * processor remains in that pset, and re-dispatch the drained threads
 * so they can be placed elsewhere.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t thread;
	queue_head_t tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	/* Collect threads on a local queue so setrun runs without the pset lock. */
	queue_init(&tqueue);

	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		/* Re-select a processor for this thread. */
		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4260
4261 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t thread;

	/* Walk every pset of every pset node, starting from pset_node0. */
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Visit each occupied realtime priority level in this pset. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				/* Track the oldest make-runnable time across all queued RT threads. */
				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4293
4294 int64_t
sched_rtlocal_runq_count_sum(void)4295 sched_rtlocal_runq_count_sum(void)
4296 {
4297 pset_node_t node = &pset_node0;
4298 processor_set_t pset = node->psets;
4299 int64_t count = 0;
4300
4301 do {
4302 while (pset != NULL) {
4303 count += pset->rt_runq.runq_stats.count_sum;
4304
4305 pset = pset->pset_list;
4306 }
4307 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4308
4309 return count;
4310 }
4311
4312 /*
4313 * Called with stealing_pset locked and
4314 * returns with stealing_pset locked
4315 * but the lock will have been dropped
4316 * if a thread is returned.
4317 */
thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	if (!sched_allow_rt_steal) {
		return THREAD_NULL;
	}
	/* Candidate victims: every pset in the node except ourselves. */
	pset_map_t pset_map = stealing_pset->node->pset_map;

	bit_clear(pset_map, stealing_pset->pset_id);

	processor_set_t pset = stealing_pset;

	processor_set_t target_pset;
	uint64_t target_deadline;

retry:
	target_pset = NULL;
	/*
	 * Only steal when the victim's earliest deadline beats ours by
	 * more than rt_deadline_epsilon.
	 */
	target_deadline = earliest_deadline - rt_deadline_epsilon;

	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		if (nset->stealable_rt_threads_earliest_deadline < target_deadline) {
			target_deadline = nset->stealable_rt_threads_earliest_deadline;
			target_pset = nset;
		}
	}

	if (target_pset != NULL) {
		/*
		 * change_locked_pset() swaps which pset lock we hold, so the
		 * target's state may have changed before we lock it — recheck.
		 */
		pset = change_locked_pset(pset, target_pset);
		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
			pset_update_rt_stealable_state(pset);
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

			/* Return with the stealing pset locked again (see header comment). */
			pset = change_locked_pset(pset, stealing_pset);
			return new_thread;
		}
		/* Lost the race: refresh our own earliest deadline and rescan. */
		pset = change_locked_pset(pset, stealing_pset);
		earliest_deadline = rt_runq_earliest_deadline(pset);
		goto retry;
	}

	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
4364
4365 /*
4366 * pset is locked
4367 */
thread_t
sched_rt_choose_thread(processor_set_t pset)
{
	processor_t processor = current_processor();
	/*
	 * Low-latency window: when rt_constraint_ll is 0 this stays 0 and
	 * the fast path below never triggers (no uint64 deadline is < 0).
	 */
	uint64_t rt_ll_deadline = 0;
	if (rt_constraint_ll != 0) {
		rt_ll_deadline = rt_constraint_ll + mach_absolute_time();
	}

	/* Fast path: a local RT thread whose deadline falls inside the LL window. */
	if (rt_runq_earliest_deadline(pset) < rt_ll_deadline) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		pset_update_rt_stealable_state(pset);
		assert(new_thread != THREAD_NULL);
		/* This CPU is taking a thread; retire any pending spill signal for it. */
		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 1);
		}
		return new_thread;
	}

	/*
	 * Try the steal path, retrying while a spill remains pending for
	 * this CPU (a spill signal can arrive during the steal attempt).
	 */
	if (SCHED(steal_thread_enabled)(pset)) {
		do {
			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
			if (spill_pending) {
				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
			}
			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
			if (new_thread != THREAD_NULL) {
				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
				}
				return new_thread;
			}
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}

	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
	}

	/* Fall back to whatever the local RT runq has. */
	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}
4416
4417 /*
4418 * realtime_queue_insert:
4419 *
4420 * Enqueue a thread for realtime execution.
4421 */
4422 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4423 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4424 {
4425 pset_assert_locked(pset);
4426
4427 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4428 pset_update_rt_stealable_state(pset);
4429
4430 return preempt;
4431 }
4432
4433 /*
4434 * realtime_setrun:
4435 *
4436 * Dispatch a thread for realtime execution.
4437 *
4438 * Thread must be locked. Associated pset must
4439 * be locked, and is returned unlocked.
4440 */
static void
realtime_setrun(
	processor_t chosen_processor,
	thread_t thread)
{
	processor_set_t pset = chosen_processor->processor_set;
	pset_assert_locked(pset);
	/* Tracks whether we still hold the pset lock across the loop below. */
	bool pset_is_locked = true;

	/* Number of extra "backup" processors to signal for this thread. */
	int n_backup = 0;

	if (thread->realtime.constraint <= rt_constraint_threshold) {
		n_backup = sched_rt_n_backup_processors;
	}
	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));

	/*
	 * CPUs with urgent ASTs already pending beyond the number of queued
	 * RT threads count as backups already in flight.
	 */
	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
	if (existing_backups > 0) {
		n_backup = n_backup - existing_backups;
		if (n_backup < 0) {
			n_backup = 0;
		}
	}

	/* Slot 0 is the chosen processor; slots 1..n_backup are backups. */
	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};

	thread->chosen_processor = chosen_processor;

	/* <rdar://problem/15102234> */
	assert(thread->bound_processor == PROCESSOR_NULL);

	realtime_queue_insert(chosen_processor, pset, thread);

	processor_t processor = chosen_processor;

	int count = 0;
	for (int i = 0; i <= n_backup; i++) {
		if (i == 0) {
			/* Chosen processor: decide whether/how to preempt it. */
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = processor;
			count++;

			ast_t preempt = AST_NONE;
			if (thread->sched_pri > processor->current_pri) {
				preempt = (AST_PREEMPT | AST_URGENT);
			} else if (thread->sched_pri == processor->current_pri) {
				/* Equal priority: preempt for low-latency threads, or an earlier deadline (with epsilon slack). */
				if (thread->realtime.constraint <= rt_constraint_ll) {
					preempt = (AST_PREEMPT | AST_URGENT);
				} else if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
					preempt = (AST_PREEMPT | AST_URGENT);
				}
			}

			if (preempt != AST_NONE) {
				if (processor->state == PROCESSOR_IDLE) {
					if (processor == current_processor()) {
						/* We are the idle target ourselves: dispatch in place, no IPI. */
						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking; just ensure the urgent bit is pending. */
					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
						KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
					}
				} else {
					/* Running (or other) state: AST locally, IPI remotely. */
					if (processor == current_processor()) {
						ast_on(preempt);

						if ((preempt & AST_URGENT) == AST_URGENT) {
							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
							}
						}

						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
						}
					} else {
						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
					}
				}
			} else {
				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
			}
		} else {
			/* Backup slots: pick additional RT-capable processors to signal.
			 * The call below updates pset_is_locked to reflect whether the
			 * pset lock is still held afterwards. */
			if (!pset_is_locked) {
				pset_lock(pset);
			}
			ipi_type[i] = SCHED_IPI_NONE;
			ipi_processor[i] = PROCESSOR_NULL;
			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
			if (ipi_processor[i] == PROCESSOR_NULL) {
				break;
			}
			count++;

			/* NOTE(review): 'backup' is not declared in this function — presumably a file-scope symbol; confirm this compiles as intended. */
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
#if defined(__x86_64__)
#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
				processor_t p0 = ipi_processor[0];
				processor_t p1 = ipi_processor[1];
				assert(p0 && p1);
				if (p_is_good(p0) && p_is_good(p1)) {
					/*
					 * Both the chosen processor and the first backup are non-cpu0 primaries,
					 * so there is no need for a 2nd backup processor.
					 */
					break;
				}
			}
#endif
		}
	}

	if (pset_is_locked) {
		pset_unlock(pset);
	}

	/* Send the recorded IPIs after dropping the pset lock. */
	assert((count > 0) && (count <= (n_backup + 1)));
	for (int i = 0; i < count; i++) {
		assert(ipi_processor[i] != PROCESSOR_NULL);
		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
	}
}
4582
4583
/*
 * sched_ipi_deferred_policy:
 *
 * Decide whether an IPI to dst may be deferred. Returns
 * SCHED_IPI_DEFERRED unless a deferred AST is already pending for the
 * cpu (then SCHED_IPI_NONE) or the thread's thread group requests
 * immediate IPIs (then SCHED_IPI_IMMEDIATE). Panics on platforms
 * built without CONFIG_SCHED_DEFERRED_AST.
 */
sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
    thread_t thread, __unused sched_ipi_event_t event)
{
#if defined(CONFIG_SCHED_DEFERRED_AST)
#if CONFIG_THREAD_GROUPS
	/* Thread groups may opt out of deferral entirely. */
	if (thread) {
		struct thread_group *tg = thread_group_get(thread);
		if (thread_group_uses_immediate_ipi(tg)) {
			return SCHED_IPI_IMMEDIATE;
		}
	}
#endif /* CONFIG_THREAD_GROUPS */
	/* Only defer if no deferred AST is already pending for this cpu. */
	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
		return SCHED_IPI_DEFERRED;
	}
#else /* CONFIG_SCHED_DEFERRED_AST */
	(void) thread;
	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
#endif /* CONFIG_SCHED_DEFERRED_AST */
	return SCHED_IPI_NONE;
}
4606
/*
 * sched_ipi_action:
 *
 * Choose (via SCHED(ipi_policy)) the IPI needed to make dst
 * reschedule for thread/event and record the pending-AST state in
 * dst's pset; an idle dst is moved to DISPATCHING here. The IPI is
 * NOT sent — the caller must pass the returned type to
 * sched_ipi_perform().
 */
sched_ipi_type_t
sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
{
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	assert(dst != NULL);

	processor_set_t pset = dst->processor_set;
	/* No IPI is ever needed for the current processor. */
	if (current_processor() == dst) {
		return SCHED_IPI_NONE;
	}

	bool dst_idle = (dst->state == PROCESSOR_IDLE);
	if (dst_idle) {
		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
	}

	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	switch (ipi_type) {
	case SCHED_IPI_NONE:
		return SCHED_IPI_NONE;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	case SCHED_IPI_DEFERRED:
		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
		break;
#endif /* CONFIG_SCHED_DEFERRED_AST */
	default:
		/* Immediate/idle IPIs record both urgent and preempt pending bits. */
		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
		}
		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
		break;
	}
	return ipi_type;
}
4642
4643 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4644 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4645 {
4646 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4647 boolean_t deferred_ipi_supported = false;
4648 processor_set_t pset = dst->processor_set;
4649
4650 #if defined(CONFIG_SCHED_DEFERRED_AST)
4651 deferred_ipi_supported = true;
4652 #endif /* CONFIG_SCHED_DEFERRED_AST */
4653
4654 switch (event) {
4655 case SCHED_IPI_EVENT_SPILL:
4656 case SCHED_IPI_EVENT_SMT_REBAL:
4657 case SCHED_IPI_EVENT_REBALANCE:
4658 case SCHED_IPI_EVENT_BOUND_THR:
4659 case SCHED_IPI_EVENT_RT_PREEMPT:
4660 /*
4661 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4662 * scenarios use immediate IPIs always.
4663 */
4664 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4665 break;
4666 case SCHED_IPI_EVENT_PREEMPT:
4667 /* In the preemption case, use immediate IPIs for RT threads */
4668 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4669 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4670 break;
4671 }
4672
4673 /*
4674 * For Non-RT threads preemption,
4675 * If the core is active, use immediate IPIs.
4676 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4677 */
4678 if (deferred_ipi_supported && dst_idle) {
4679 return sched_ipi_deferred_policy(pset, dst, thread, event);
4680 }
4681 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4682 break;
4683 default:
4684 panic("Unrecognized scheduler IPI event type %d", event);
4685 }
4686 assert(ipi_type != SCHED_IPI_NONE);
4687 return ipi_type;
4688 }
4689
4690 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4691 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4692 {
4693 switch (ipi) {
4694 case SCHED_IPI_NONE:
4695 break;
4696 case SCHED_IPI_IDLE:
4697 machine_signal_idle(dst);
4698 break;
4699 case SCHED_IPI_IMMEDIATE:
4700 cause_ast_check(dst);
4701 break;
4702 case SCHED_IPI_DEFERRED:
4703 machine_signal_idle_deferred(dst);
4704 break;
4705 default:
4706 panic("Unrecognized scheduler IPI type: %d", ipi);
4707 }
4708 }
4709
4710 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4711
4712 boolean_t
priority_is_urgent(int priority)4713 priority_is_urgent(int priority)
4714 {
4715 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4716 }
4717
4718 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4719
4720 /*
4721 * processor_setrun:
4722 *
4723 * Dispatch a thread for execution on a
4724 * processor.
4725 *
4726 * Thread must be locked. Associated pset must
4727 * be locked, and is returned unlocked.
4728 */
4729 static void
processor_setrun(processor_t processor,thread_t thread,integer_t options)4730 processor_setrun(
4731 processor_t processor,
4732 thread_t thread,
4733 integer_t options)
4734 {
4735 processor_set_t pset = processor->processor_set;
4736 pset_assert_locked(pset);
4737 ast_t preempt;
4738 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4739
4740 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4741
4742 thread->chosen_processor = processor;
4743
4744 /*
4745 * Set preemption mode.
4746 */
4747 #if defined(CONFIG_SCHED_DEFERRED_AST)
4748 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4749 #endif
4750 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4751 preempt = (AST_PREEMPT | AST_URGENT);
4752 } else if (processor->current_is_eagerpreempt) {
4753 preempt = (AST_PREEMPT | AST_URGENT);
4754 } else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4755 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4756 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4757 } else {
4758 preempt = AST_NONE;
4759 }
4760 } else {
4761 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4762 }
4763
4764 if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4765 /*
4766 * Having gone to the trouble of forcing this thread off a less preferred core,
4767 * we should force the preferable core to reschedule immediately to give this
4768 * thread a chance to run instead of just sitting on the run queue where
4769 * it may just be stolen back by the idle core we just forced it off.
4770 */
4771 preempt |= AST_PREEMPT;
4772 }
4773
4774 SCHED(processor_enqueue)(processor, thread, options);
4775 sched_update_pset_load_average(pset, 0);
4776
4777 if (preempt != AST_NONE) {
4778 if (processor->state == PROCESSOR_IDLE) {
4779 ipi_action = eExitIdle;
4780 } else if (processor->state == PROCESSOR_DISPATCHING) {
4781 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4782 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4783 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4784 }
4785 } else if ((processor->state == PROCESSOR_RUNNING ||
4786 processor->state == PROCESSOR_SHUTDOWN) &&
4787 (thread->sched_pri >= processor->current_pri)) {
4788 ipi_action = eInterruptRunning;
4789 }
4790 } else {
4791 /*
4792 * New thread is not important enough to preempt what is running, but
4793 * special processor states may need special handling
4794 */
4795 if (processor->state == PROCESSOR_SHUTDOWN &&
4796 thread->sched_pri >= processor->current_pri) {
4797 ipi_action = eInterruptRunning;
4798 } else if (processor->state == PROCESSOR_IDLE) {
4799 ipi_action = eExitIdle;
4800 } else if (processor->state == PROCESSOR_DISPATCHING) {
4801 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4802 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4803 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
4804 }
4805 }
4806 }
4807
4808 if (ipi_action != eDoNothing) {
4809 if (processor == current_processor()) {
4810 if (ipi_action == eExitIdle) {
4811 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4812 }
4813 if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
4814 ast_on(preempt);
4815 }
4816
4817 if ((preempt & AST_URGENT) == AST_URGENT) {
4818 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4819 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4820 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
4821 }
4822 } else {
4823 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4824 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
4825 }
4826 }
4827
4828 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4829 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4830 } else {
4831 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4832 }
4833 } else {
4834 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
4835 ipi_type = sched_ipi_action(processor, thread, event);
4836 }
4837 }
4838 pset_unlock(pset);
4839 sched_ipi_perform(processor, ipi_type);
4840 }
4841
4842 /*
4843 * choose_next_pset:
4844 *
4845 * Return the next sibling pset containing
4846 * available processors.
4847 *
4848 * Returns the original pset if none other is
4849 * suitable.
4850 */
4851 static processor_set_t
choose_next_pset(processor_set_t pset)4852 choose_next_pset(
4853 processor_set_t pset)
4854 {
4855 processor_set_t nset = pset;
4856
4857 do {
4858 nset = next_pset(nset);
4859 } while (nset->online_processor_count < 1 && nset != pset);
4860
4861 return nset;
4862 }
4863
4864 /*
4865 * choose_processor:
4866 *
4867 * Choose a processor for the thread, beginning at
4868 * the pset. Accepts an optional processor hint in
4869 * the pset.
4870 *
4871 * Returns a processor, possibly from a different pset.
4872 *
4873 * The thread must be locked. The pset must be locked,
4874 * and the resulting pset is locked on return.
4875 */
4876 processor_t
choose_processor(processor_set_t starting_pset,processor_t processor,thread_t thread)4877 choose_processor(
4878 processor_set_t starting_pset,
4879 processor_t processor,
4880 thread_t thread)
4881 {
4882 processor_set_t pset = starting_pset;
4883 processor_set_t nset;
4884
4885 assert(thread->sched_pri <= MAXPRI);
4886
4887 /*
4888 * Prefer the hinted processor, when appropriate.
4889 */
4890
4891 /* Fold last processor hint from secondary processor to its primary */
4892 if (processor != PROCESSOR_NULL) {
4893 processor = processor->processor_primary;
4894 }
4895
4896 /*
4897 * Only consult platform layer if pset is active, which
4898 * it may not be in some cases when a multi-set system
4899 * is going to sleep.
4900 */
4901 if (pset->online_processor_count) {
4902 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
4903 processor_t mc_processor = machine_choose_processor(pset, processor);
4904 if (mc_processor != PROCESSOR_NULL) {
4905 processor = mc_processor->processor_primary;
4906 }
4907 }
4908 }
4909
4910 /*
4911 * At this point, we may have a processor hint, and we may have
4912 * an initial starting pset. If the hint is not in the pset, or
4913 * if the hint is for a processor in an invalid state, discard
4914 * the hint.
4915 */
4916 if (processor != PROCESSOR_NULL) {
4917 if (processor->processor_set != pset) {
4918 processor = PROCESSOR_NULL;
4919 } else if (!processor->is_recommended) {
4920 processor = PROCESSOR_NULL;
4921 } else {
4922 switch (processor->state) {
4923 case PROCESSOR_START:
4924 case PROCESSOR_SHUTDOWN:
4925 case PROCESSOR_OFF_LINE:
4926 /*
4927 * Hint is for a processor that cannot support running new threads.
4928 */
4929 processor = PROCESSOR_NULL;
4930 break;
4931 case PROCESSOR_IDLE:
4932 /*
4933 * Hint is for an idle processor. Assume it is no worse than any other
4934 * idle processor. The platform layer had an opportunity to provide
4935 * the "least cost idle" processor above.
4936 */
4937 if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
4938 return processor;
4939 }
4940 processor = PROCESSOR_NULL;
4941 break;
4942 case PROCESSOR_RUNNING:
4943 case PROCESSOR_DISPATCHING:
4944 /*
4945 * Hint is for an active CPU. This fast-path allows
4946 * realtime threads to preempt non-realtime threads
4947 * to regain their previous executing processor.
4948 */
4949 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4950 if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
4951 return processor;
4952 }
4953 processor = PROCESSOR_NULL;
4954 }
4955
4956 /* Otherwise, use hint as part of search below */
4957 break;
4958 default:
4959 processor = PROCESSOR_NULL;
4960 break;
4961 }
4962 }
4963 }
4964
4965 /*
4966 * Iterate through the processor sets to locate
4967 * an appropriate processor. Seed results with
4968 * a last-processor hint, if available, so that
4969 * a search must find something strictly better
4970 * to replace it.
4971 *
4972 * A primary/secondary pair of SMT processors are
4973 * "unpaired" if the primary is busy but its
4974 * corresponding secondary is idle (so the physical
4975 * core has full use of its resources).
4976 */
4977
4978 integer_t lowest_priority = MAXPRI + 1;
4979 integer_t lowest_secondary_priority = MAXPRI + 1;
4980 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
4981 integer_t lowest_idle_secondary_priority = MAXPRI + 1;
4982 integer_t lowest_count = INT_MAX;
4983 uint64_t furthest_deadline = 1;
4984 processor_t lp_processor = PROCESSOR_NULL;
4985 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
4986 processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
4987 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
4988 processor_t lc_processor = PROCESSOR_NULL;
4989 processor_t fd_processor = PROCESSOR_NULL;
4990
4991 if (processor != PROCESSOR_NULL) {
4992 /* All other states should be enumerated above. */
4993 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
4994 assert(thread->sched_pri < BASEPRI_RTQUEUES);
4995
4996 lowest_priority = processor->current_pri;
4997 lp_processor = processor;
4998
4999 lowest_count = SCHED(processor_runq_count)(processor);
5000 lc_processor = processor;
5001 }
5002
5003 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5004 pset_node_t node = pset->node;
5005 bool include_ast_urgent_pending_cpus = false;
5006 cpumap_t ast_urgent_pending;
5007 try_again:
5008 ast_urgent_pending = 0;
5009 int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
5010 for (; consider_secondaries < 2; consider_secondaries++) {
5011 pset = change_locked_pset(pset, starting_pset);
5012 do {
5013 cpumap_t available_map = pset_available_cpumap(pset);
5014 if (available_map == 0) {
5015 goto no_available_cpus;
5016 }
5017
5018 processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5019 if (processor) {
5020 return processor;
5021 }
5022
5023 if (consider_secondaries) {
5024 processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5025 if (processor && (processor->deadline > furthest_deadline)) {
5026 fd_processor = processor;
5027 furthest_deadline = processor->deadline;
5028 if (sched_choose_first_fd_processor && ((rt_constraint_ll == 0) || (furthest_deadline > rt_constraint_ll + mach_absolute_time()))) {
5029 /*
5030 * Instead of looping through all the psets to find the global
5031 * furthest deadline processor, preempt the first candidate found.
5032 * The preempted thread will then find any other available far deadline
5033 * processors to preempt.
5034 */
5035 return fd_processor;
5036 }
5037 }
5038
5039 ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5040
5041 if (rt_runq_count(pset) < lowest_count) {
5042 int cpuid = bit_first(available_map);
5043 assert(cpuid >= 0);
5044 lc_processor = processor_array[cpuid];
5045 lowest_count = rt_runq_count(pset);
5046 }
5047 }
5048
5049 no_available_cpus:
5050 nset = next_pset(pset);
5051
5052 if (nset != starting_pset) {
5053 pset = change_locked_pset(pset, nset);
5054 }
5055 } while (nset != starting_pset);
5056 }
5057
5058 /* Short cut for single pset nodes */
5059 if (bit_count(node->pset_map) == 1) {
5060 if (fd_processor) {
5061 pset_assert_locked(fd_processor->processor_set);
5062 return fd_processor;
5063 } else if (lc_processor) {
5064 pset_assert_locked(lc_processor->processor_set);
5065 return lc_processor;
5066 }
5067 } else {
5068 if ((fd_processor == PROCESSOR_NULL) && ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5069 /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5070 include_ast_urgent_pending_cpus = true;
5071 goto try_again;
5072 }
5073 }
5074
5075 processor = PROCESSOR_NULL;
5076 if (fd_processor) {
5077 processor = fd_processor;
5078 } else if (lc_processor) {
5079 processor = lc_processor;
5080 }
5081
5082 if (processor) {
5083 pset = change_locked_pset(pset, processor->processor_set);
5084 /* Check that chosen processor is still usable */
5085 cpumap_t available_map = pset_available_cpumap(pset);
5086 if (bit_test(available_map, processor->cpu_id)) {
5087 return processor;
5088 }
5089
5090 /* processor is no longer usable */
5091 processor = PROCESSOR_NULL;
5092 }
5093
5094 pset_assert_locked(pset);
5095 pset_unlock(pset);
5096 return PROCESSOR_NULL;
5097 }
5098
5099 /* No realtime threads from this point on */
5100 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5101
5102 do {
5103 /*
5104 * Choose an idle processor, in pset traversal order
5105 */
5106
5107 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5108 pset->primary_map &
5109 pset->recommended_bitmask);
5110
5111 /* there shouldn't be a pending AST if the processor is idle */
5112 assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5113
5114 int cpuid = lsb_first(idle_primary_map);
5115 if (cpuid >= 0) {
5116 processor = processor_array[cpuid];
5117 return processor;
5118 }
5119
5120 /*
5121 * Otherwise, enumerate active and idle processors to find primary candidates
5122 * with lower priority/etc.
5123 */
5124
5125 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5126 pset->recommended_bitmask &
5127 ~pset->pending_AST_URGENT_cpu_mask);
5128
5129 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5130 active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5131 }
5132
5133 active_map = bit_ror64(active_map, (pset->last_chosen + 1));
5134 for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5135 cpuid = ((rotid + pset->last_chosen + 1) & 63);
5136 processor = processor_array[cpuid];
5137
5138 integer_t cpri = processor->current_pri;
5139 processor_t primary = processor->processor_primary;
5140 if (primary != processor) {
5141 /* If primary is running a NO_SMT thread, don't choose its secondary */
5142 if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5143 if (cpri < lowest_secondary_priority) {
5144 lowest_secondary_priority = cpri;
5145 lp_paired_secondary_processor = processor;
5146 }
5147 }
5148 } else {
5149 if (cpri < lowest_priority) {
5150 lowest_priority = cpri;
5151 lp_processor = processor;
5152 }
5153 }
5154
5155 integer_t ccount = SCHED(processor_runq_count)(processor);
5156 if (ccount < lowest_count) {
5157 lowest_count = ccount;
5158 lc_processor = processor;
5159 }
5160 }
5161
5162 /*
5163 * For SMT configs, these idle secondary processors must have active primary. Otherwise
5164 * the idle primary would have short-circuited the loop above
5165 */
5166 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5167 ~pset->primary_map &
5168 pset->recommended_bitmask);
5169
5170 /* there shouldn't be a pending AST if the processor is idle */
5171 assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5172 assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5173
5174 for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5175 processor = processor_array[cpuid];
5176
5177 processor_t cprimary = processor->processor_primary;
5178
5179 integer_t primary_pri = cprimary->current_pri;
5180
5181 /*
5182 * TODO: This should also make the same decisions
5183 * as secondary_can_run_realtime_thread
5184 *
5185 * TODO: Keep track of the pending preemption priority
5186 * of the primary to make this more accurate.
5187 */
5188
5189 /* If the primary is running a no-smt thread, then don't choose its secondary */
5190 if (cprimary->state == PROCESSOR_RUNNING &&
5191 processor_active_thread_no_smt(cprimary)) {
5192 continue;
5193 }
5194
5195 /*
5196 * Find the idle secondary processor with the lowest priority primary
5197 *
5198 * We will choose this processor as a fallback if we find no better
5199 * primary to preempt.
5200 */
5201 if (primary_pri < lowest_idle_secondary_priority) {
5202 lp_idle_secondary_processor = processor;
5203 lowest_idle_secondary_priority = primary_pri;
5204 }
5205
5206 /* Find the the lowest priority active primary with idle secondary */
5207 if (primary_pri < lowest_unpaired_primary_priority) {
5208 /* If the primary processor is offline or starting up, it's not a candidate for this path */
5209 if (cprimary->state != PROCESSOR_RUNNING &&
5210 cprimary->state != PROCESSOR_DISPATCHING) {
5211 continue;
5212 }
5213
5214 if (!cprimary->is_recommended) {
5215 continue;
5216 }
5217
5218 /* if the primary is pending preemption, don't try to re-preempt it */
5219 if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5220 continue;
5221 }
5222
5223 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5224 bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5225 continue;
5226 }
5227
5228 lowest_unpaired_primary_priority = primary_pri;
5229 lp_unpaired_primary_processor = cprimary;
5230 }
5231 }
5232
5233 /*
5234 * We prefer preempting a primary processor over waking up its secondary.
5235 * The secondary will then be woken up by the preempted thread.
5236 */
5237 if (thread->sched_pri > lowest_unpaired_primary_priority) {
5238 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5239 return lp_unpaired_primary_processor;
5240 }
5241
5242 /*
5243 * We prefer preempting a lower priority active processor over directly
5244 * waking up an idle secondary.
5245 * The preempted thread will then find the idle secondary.
5246 */
5247 if (thread->sched_pri > lowest_priority) {
5248 pset->last_chosen = lp_processor->cpu_id;
5249 return lp_processor;
5250 }
5251
5252 /*
5253 * lc_processor is used to indicate the best processor set run queue
5254 * on which to enqueue a thread when all available CPUs are busy with
5255 * higher priority threads, so try to make sure it is initialized.
5256 */
5257 if (lc_processor == PROCESSOR_NULL) {
5258 cpumap_t available_map = pset_available_cpumap(pset);
5259 cpuid = lsb_first(available_map);
5260 if (cpuid >= 0) {
5261 lc_processor = processor_array[cpuid];
5262 lowest_count = SCHED(processor_runq_count)(lc_processor);
5263 }
5264 }
5265
5266 /*
5267 * Move onto the next processor set.
5268 *
5269 * If all primary processors in this pset are running a higher
5270 * priority thread, move on to next pset. Only when we have
5271 * exhausted the search for primary processors do we
5272 * fall back to secondaries.
5273 */
5274 #if CONFIG_SCHED_EDGE
5275 /*
5276 * The edge scheduler expects a CPU to be selected from the pset it passed in
5277 * as the starting pset for non-RT workloads. The edge migration algorithm
5278 * should already have considered idle CPUs and loads to decide the starting_pset;
5279 * which means that this loop can be short-circuted.
5280 */
5281 nset = starting_pset;
5282 #else /* CONFIG_SCHED_EDGE */
5283 nset = next_pset(pset);
5284 #endif /* CONFIG_SCHED_EDGE */
5285
5286 if (nset != starting_pset) {
5287 pset = change_locked_pset(pset, nset);
5288 }
5289 } while (nset != starting_pset);
5290
5291 /*
5292 * Make sure that we pick a running processor,
5293 * and that the correct processor set is locked.
5294 * Since we may have unlocked the candidate processor's
5295 * pset, it may have changed state.
5296 *
5297 * All primary processors are running a higher priority
5298 * thread, so the only options left are enqueuing on
5299 * the secondary processor that would perturb the least priority
5300 * primary, or the least busy primary.
5301 */
5302
5303 /* lowest_priority is evaluated in the main loops above */
5304 if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5305 processor = lp_idle_secondary_processor;
5306 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5307 processor = lp_paired_secondary_processor;
5308 } else if (lc_processor != PROCESSOR_NULL) {
5309 processor = lc_processor;
5310 } else {
5311 processor = PROCESSOR_NULL;
5312 }
5313
5314 if (processor) {
5315 pset = change_locked_pset(pset, processor->processor_set);
5316 /* Check that chosen processor is still usable */
5317 cpumap_t available_map = pset_available_cpumap(pset);
5318 if (bit_test(available_map, processor->cpu_id)) {
5319 pset->last_chosen = processor->cpu_id;
5320 return processor;
5321 }
5322
5323 /* processor is no longer usable */
5324 processor = PROCESSOR_NULL;
5325 }
5326
5327 pset_assert_locked(pset);
5328 pset_unlock(pset);
5329 return PROCESSOR_NULL;
5330 }
5331
5332 /*
5333 * Default implementation of SCHED(choose_node)()
5334 * for single node systems
5335 */
5336 pset_node_t
sched_choose_node(__unused thread_t thread)5337 sched_choose_node(__unused thread_t thread)
5338 {
5339 return &pset_node0;
5340 }
5341
5342 /*
5343 * choose_starting_pset:
5344 *
5345 * Choose a starting processor set for the thread.
5346 * May return a processor hint within the pset.
5347 *
5348 * Returns a starting processor set, to be used by
5349 * choose_processor.
5350 *
5351 * The thread must be locked. The resulting pset is unlocked on return,
5352 * and is chosen without taking any pset locks.
5353 */
5354 processor_set_t
choose_starting_pset(pset_node_t node,thread_t thread,processor_t * processor_hint)5355 choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5356 {
5357 processor_set_t pset;
5358 processor_t processor = PROCESSOR_NULL;
5359
5360 if (thread->affinity_set != AFFINITY_SET_NULL) {
5361 /*
5362 * Use affinity set policy hint.
5363 */
5364 pset = thread->affinity_set->aset_pset;
5365 } else if (thread->last_processor != PROCESSOR_NULL) {
5366 /*
5367 * Simple (last processor) affinity case.
5368 */
5369 processor = thread->last_processor;
5370 pset = processor->processor_set;
5371 } else {
5372 /*
5373 * No Affinity case:
5374 *
5375 * Utilitize a per task hint to spread threads
5376 * among the available processor sets.
5377 * NRG this seems like the wrong thing to do.
5378 * See also task->pset_hint = pset in thread_setrun()
5379 */
5380 pset = get_threadtask(thread)->pset_hint;
5381 if (pset == PROCESSOR_SET_NULL) {
5382 pset = current_processor()->processor_set;
5383 }
5384
5385 pset = choose_next_pset(pset);
5386 }
5387
5388 if (!bit_test(node->pset_map, pset->pset_id)) {
5389 /* pset is not from this node so choose one that is */
5390 int id = lsb_first(node->pset_map);
5391 if (id < 0) {
5392 /* startup race, so check again under the node lock */
5393 lck_spin_lock(&pset_node_lock);
5394 if (bit_test(node->pset_map, pset->pset_id)) {
5395 id = pset->pset_id;
5396 } else {
5397 id = lsb_first(node->pset_map);
5398 }
5399 lck_spin_unlock(&pset_node_lock);
5400 }
5401 assert(id >= 0);
5402 pset = pset_array[id];
5403 }
5404
5405 if (bit_count(node->pset_map) == 1) {
5406 /* Only a single pset in this node */
5407 goto out;
5408 }
5409
5410 bool avoid_cpu0 = false;
5411
5412 #if defined(__x86_64__)
5413 if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5414 /* Avoid the pset containing cpu0 */
5415 avoid_cpu0 = true;
5416 /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
5417 assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5418 }
5419 #endif
5420
5421 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5422 pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5423 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5424 if (avoid_cpu0) {
5425 rt_target_map = bit_ror64(rt_target_map, 1);
5426 }
5427 int rotid = lsb_first(rt_target_map);
5428 if (rotid >= 0) {
5429 int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5430 pset = pset_array[id];
5431 goto out;
5432 }
5433 }
5434 if (!pset->is_SMT || !sched_allow_rt_smt) {
5435 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5436 goto out;
5437 }
5438 rt_target_map = atomic_load(&node->pset_non_rt_map);
5439 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5440 if (avoid_cpu0) {
5441 rt_target_map = bit_ror64(rt_target_map, 1);
5442 }
5443 int rotid = lsb_first(rt_target_map);
5444 if (rotid >= 0) {
5445 int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5446 pset = pset_array[id];
5447 goto out;
5448 }
5449 }
5450 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5451 } else {
5452 pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5453 if (!bit_test(idle_map, pset->pset_id)) {
5454 int next_idle_pset_id = lsb_first(idle_map);
5455 if (next_idle_pset_id >= 0) {
5456 pset = pset_array[next_idle_pset_id];
5457 }
5458 }
5459 }
5460
5461 out:
5462 if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5463 processor = PROCESSOR_NULL;
5464 }
5465 if (processor != PROCESSOR_NULL) {
5466 *processor_hint = processor;
5467 }
5468
5469 assert(pset != NULL);
5470 return pset;
5471 }
5472
5473 /*
5474 * thread_setrun:
5475 *
5476 * Dispatch thread for execution, onto an idle
5477 * processor or run queue, and signal a preemption
5478 * as appropriate.
5479 *
5480 * Thread must be locked.
5481 */
5482 void
thread_setrun(thread_t thread,sched_options_t options)5483 thread_setrun(
5484 thread_t thread,
5485 sched_options_t options)
5486 {
5487 processor_t processor = PROCESSOR_NULL;
5488 processor_set_t pset;
5489
5490 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5491 assert(thread->runq == PROCESSOR_NULL);
5492
5493 #if CONFIG_PREADOPT_TG
5494 /* We know that the thread is not in the runq by virtue of being in this
5495 * function and the thread is not self since we are running. We can safely
5496 * resolve the thread group hierarchy and modify the thread's thread group
5497 * here. */
5498 thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5499 #endif
5500
5501 /*
5502 * Update priority if needed.
5503 */
5504 if (SCHED(can_update_priority)(thread)) {
5505 SCHED(update_priority)(thread);
5506 }
5507 thread->sfi_class = sfi_thread_classify(thread);
5508
5509 if (thread->bound_processor == PROCESSOR_NULL) {
5510 /*
5511 * Unbound case.
5512 *
5513 * Usually, this loop will only be executed once,
5514 * but if CLPC derecommends a processor after it has been chosen,
5515 * or if a processor is shut down after it is chosen,
5516 * choose_processor() may return NULL, so a retry
5517 * may be necessary. A single retry will usually
5518 * be enough, and we can't afford to retry too many times
5519 * because interrupts are disabled.
5520 */
5521 #define CHOOSE_PROCESSOR_MAX_RETRIES 3
5522 for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5523 processor_t processor_hint = PROCESSOR_NULL;
5524 pset_node_t node = SCHED(choose_node)(thread);
5525 processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5526
5527 pset_lock(starting_pset);
5528
5529 processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5530 if (processor != PROCESSOR_NULL) {
5531 pset = processor->processor_set;
5532 pset_assert_locked(pset);
5533 break;
5534 }
5535 }
5536 /*
5537 * If choose_processor() still returns NULL,
5538 * which is very unlikely,
5539 * choose the master_processor, which is always
5540 * safe to choose.
5541 */
5542 if (processor == PROCESSOR_NULL) {
5543 /* Choose fallback processor */
5544 processor = master_processor;
5545 pset = processor->processor_set;
5546 pset_lock(pset);
5547 }
5548 task_t task = get_threadtask(thread);
5549 if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5550 task->pset_hint = pset; /* NRG this is done without holding the task lock */
5551 }
5552 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5553 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5554 } else {
5555 /*
5556 * Bound case:
5557 *
5558 * Unconditionally dispatch on the processor.
5559 */
5560 processor = thread->bound_processor;
5561 pset = processor->processor_set;
5562 pset_lock(pset);
5563
5564 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5565 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5566 }
5567
5568 /*
5569 * Dispatch the thread on the chosen processor.
5570 * TODO: This should be based on sched_mode, not sched_pri
5571 */
5572 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5573 realtime_setrun(processor, thread);
5574 } else {
5575 processor_setrun(processor, thread, options);
5576 }
5577 /* pset is now unlocked */
5578 if (thread->bound_processor == PROCESSOR_NULL) {
5579 SCHED(check_spill)(pset, thread);
5580 }
5581 }
5582
5583 processor_set_t
task_choose_pset(task_t task)5584 task_choose_pset(
5585 task_t task)
5586 {
5587 processor_set_t pset = task->pset_hint;
5588
5589 if (pset != PROCESSOR_SET_NULL) {
5590 pset = choose_next_pset(pset);
5591 }
5592
5593 return pset;
5594 }
5595
5596 /*
5597 * Check for a preemption point in
5598 * the current context.
5599 *
5600 * Called at splsched with thread locked.
5601 */
5602 ast_t
csw_check(thread_t thread,processor_t processor,ast_t check_reason)5603 csw_check(
5604 thread_t thread,
5605 processor_t processor,
5606 ast_t check_reason)
5607 {
5608 processor_set_t pset = processor->processor_set;
5609
5610 assert(thread == processor->active_thread);
5611
5612 pset_lock(pset);
5613
5614 processor_state_update_from_thread(processor, thread, true);
5615
5616 ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5617
5618 /* Acknowledge the IPI if we decided not to preempt */
5619
5620 if ((preempt & AST_URGENT) == 0) {
5621 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5622 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5623 }
5624 }
5625
5626 if ((preempt & AST_PREEMPT) == 0) {
5627 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5628 }
5629
5630 pset_unlock(pset);
5631
5632 return preempt;
5633 }
5634
5635 /*
5636 * Check for preemption at splsched with
5637 * pset and thread locked
5638 */
5639 ast_t
csw_check_locked(thread_t thread,processor_t processor,processor_set_t pset,ast_t check_reason)5640 csw_check_locked(
5641 thread_t thread,
5642 processor_t processor,
5643 processor_set_t pset,
5644 ast_t check_reason)
5645 {
5646 /*
5647 * If the current thread is running on a processor that is no longer recommended,
5648 * urgently preempt it, at which point thread_select() should
5649 * try to idle the processor and re-dispatch the thread to a recommended processor.
5650 */
5651 if (!processor->is_recommended) {
5652 return check_reason | AST_PREEMPT | AST_URGENT;
5653 }
5654
5655 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5656 return check_reason | AST_PREEMPT | AST_URGENT;
5657 }
5658
5659 if (rt_runq_count(pset) > 0) {
5660 if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5661 return check_reason | AST_PREEMPT | AST_URGENT;
5662 } else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5663 return check_reason | AST_PREEMPT | AST_URGENT;
5664 } else {
5665 return check_reason | AST_PREEMPT;
5666 }
5667 }
5668
5669 ast_t result = SCHED(processor_csw_check)(processor);
5670 if (result != AST_NONE) {
5671 return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
5672 }
5673
5674 /*
5675 * Same for avoid-processor
5676 *
5677 * TODO: Should these set AST_REBALANCE?
5678 */
5679 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
5680 return check_reason | AST_PREEMPT;
5681 }
5682
5683 /*
5684 * Even though we could continue executing on this processor, a
5685 * secondary SMT core should try to shed load to another primary core.
5686 *
5687 * TODO: Should this do the same check that thread_select does? i.e.
5688 * if no bound threads target this processor, and idle primaries exist, preempt
5689 * The case of RT threads existing is already taken care of above
5690 */
5691
5692 if (processor->current_pri < BASEPRI_RTQUEUES &&
5693 processor->processor_primary != processor) {
5694 return check_reason | AST_PREEMPT;
5695 }
5696
5697 if (thread->state & TH_SUSP) {
5698 return check_reason | AST_PREEMPT;
5699 }
5700
5701 #if CONFIG_SCHED_SFI
5702 /*
5703 * Current thread may not need to be preempted, but maybe needs
5704 * an SFI wait?
5705 */
5706 result = sfi_thread_needs_ast(thread, NULL);
5707 if (result != AST_NONE) {
5708 return check_reason | result;
5709 }
5710 #endif
5711
5712 return AST_NONE;
5713 }
5714
/*
 * ast_ack_if_needed:
 *
 * Acknowledge any AST-check generation this processor has observed,
 * presumably so that remote callers of ast_generation_get() can tell
 * this CPU has passed through ast_check() — TODO confirm against the
 * ast_generation_get() implementation.
 */
static void
ast_ack_if_needed(processor_t processor)
{
	/* Per-CPU generation/ack pair for this processor */
	struct ast_gen_pair *pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, processor);
	ast_gen_t gen;

	/*
	 * Make sure that if we observe a new generation, we ack it.
	 *
	 * Note that this ack might ack for a cause_ast_check()
	 * that hasn't happened yet: 2 different cores A and B could
	 * have called ast_generation_get(), we observe B's generation
	 * already, before B has had a chance to call cause_ast_check() yet.
	 *
	 * This still preserves the property that we want,
	 * which is that `processor` has been in ast_check()
	 * _after_ ast_generation_get() was called.
	 */

	gen = os_atomic_load(&pair->ast_gen, relaxed);
	if (gen != os_atomic_load(&pair->ast_ack, relaxed)) {
		/* pairs with the fence in ast_generation_get() */
		os_atomic_thread_fence(acq_rel);
		os_atomic_store(&pair->ast_ack, gen, relaxed);
	}
}
5741
5742 /*
5743 * Handle preemption IPI or IPI in response to setting an AST flag
5744 * Triggered by cause_ast_check
5745 * Called at splsched
5746 */
void
ast_check(processor_t processor)
{
	/*
	 * A processor that is not running (or shutting down) has no
	 * current thread whose preemption needs re-evaluation; just ack
	 * any pending AST generation so ast_generation_wait() can't stall.
	 */
	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		ast_ack_if_needed(processor);
		return;
	}

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	thread_lock(thread);

	ast_ack_if_needed(processor);
	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	/* Re-run the preemption check; latch any resulting AST on this CPU */
	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	/* Perfcontrol callout happens outside the thread lock */
	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}
}
5806
void
ast_generation_get(processor_t processor, ast_gen_t gens[])
{
	/*
	 * Publish a new AST generation for `processor` and record it in the
	 * caller's gens[] array (indexed by cpu_id) for a later
	 * ast_generation_wait().
	 *
	 * A 0 entry in gens[] means "not requested" to ast_generation_wait(),
	 * so the generation is advanced by 2 to stay nonzero.
	 * NOTE(review): the even step presumably also keeps the low bit of
	 * ast_gen_t reserved — confirm against other ast_gen_t users.
	 *
	 * The release ordering pairs with the acq_rel fence in
	 * ast_ack_if_needed().
	 */
	struct ast_gen_pair *pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, processor);

	gens[processor->cpu_id] = os_atomic_add(&pair->ast_gen, 2, release);
}
5814
void
ast_generation_wait(ast_gen_t gens[MAX_CPUS])
{
	/*
	 * Wait until every processor with a nonzero slot in gens[] has
	 * acknowledged (in ast_ack_if_needed()) a generation at least as
	 * new as the one recorded by ast_generation_get().  This proves
	 * each such processor has been through ast_check() since then.
	 */
	percpu_foreach(cpup, processor) {
		struct ast_gen_pair *pair;
		ast_gen_t gen_ack;
		uint32_t cpu = cpup->cpu_id;

		/* 0 means the caller never requested this CPU */
		if (gens[cpu] == 0) {
			continue;
		}
		pair = PERCPU_GET_RELATIVE(ast_gen_pair, processor, cpup);
		gen_ack = os_atomic_load(&pair->ast_ack, relaxed);
		while (__improbable(AST_GEN_CMP(gen_ack, <, gens[cpu]))) {
			/*
			 * Spin (preemption disabled) until the ack value
			 * changes, then re-test against the target generation.
			 */
			disable_preemption();
			gen_ack = hw_wait_while_equals_long(&pair->ast_ack, gen_ack);
			enable_preemption();
		}
	}
}
5835
5836
5837 /*
5838 * set_sched_pri:
5839 *
5840 * Set the scheduled priority of the specified thread.
5841 *
5842 * This may cause the thread to change queues.
5843 *
5844 * Thread must be locked.
5845 */
void
set_sched_pri(
	thread_t        thread,
	int16_t         new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	/* SETPRI_LAZY: skip aggressive preemption checks / cross-CPU IPIs */
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* A running thread is never on a run queue */
		assert(thread->state & TH_RUN);
		assert(thread->runq == PROCESSOR_NULL);
	} else {
		/* Pull the thread off its runq (if any) before changing priority */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Re-queue at the new priority */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		/*
		 * Runnable and running on some *other* CPU: poke that CPU so it
		 * re-evaluates preemption, unless the caller asked to be lazy.
		 */
		processor_t processor = thread->last_processor;

		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
5957
5958 /*
5959 * thread_run_queue_remove_for_handoff
5960 *
5961 * Pull a thread or its (recursive) push target out of the runqueue
5962 * so that it is ready for thread_run()
5963 *
5964 * Called at splsched
5965 *
5966 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
5967 * This may be different than the thread that was passed in.
5968 */
thread_t
thread_run_queue_remove_for_handoff(thread_t thread)
{
	thread_t pulled_thread = THREAD_NULL;

	thread_lock(thread);

	/*
	 * Check that the thread is not bound to a different processor,
	 * NO_SMT flag is not set on the thread, cluster type of
	 * processor matches with thread if the thread is pinned to a
	 * particular cluster and that realtime is not involved.
	 *
	 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
	 */
	processor_t processor = current_processor();
	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
	    && (!thread_no_smt(thread))
	    && (processor->current_pri < BASEPRI_RTQUEUES)
	    && (thread->sched_pri < BASEPRI_RTQUEUES)
#if __AMP__
	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
	    ) {
		/* Eligible: it comes off the runq only if it was actually queued */
		if (thread_run_queue_remove(thread)) {
			pulled_thread = thread;
		}
	}

	thread_unlock(thread);

	/* THREAD_NULL if the thread was ineligible or not on a run queue */
	return pulled_thread;
}
6003
6004 /*
6005 * thread_prepare_for_handoff
6006 *
6007 * Make the thread ready for handoff.
6008 * If the thread was runnable then pull it off the runq, if the thread could
6009 * not be pulled, return NULL.
6010 *
6011 * If the thread was woken up from wait for handoff, make sure it is not bound to
6012 * different processor.
6013 *
6014 * Called at splsched
6015 *
6016 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6017 * This may be different than the thread that was passed in.
6018 */
thread_t
thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
{
	thread_t pulled_thread = THREAD_NULL;

	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
		/*
		 * Thread was just woken for handoff (not pulled from a runq):
		 * verify it may run here; otherwise fall back to a normal
		 * setrun so it gets scheduled wherever it belongs.
		 */
		processor_t processor = current_processor();
		thread_lock(thread);

		/*
		 * Check that the thread is not bound to a different processor,
		 * NO_SMT flag is not set on the thread and cluster type of
		 * processor matches with thread if the thread is pinned to a
		 * particular cluster. Call setrun instead if above conditions
		 * are not satisfied.
		 */
		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
		    && (!thread_no_smt(thread))
#if __AMP__
		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
#endif /* __AMP__ */
		    ) {
			pulled_thread = thread;
		} else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
		thread_unlock(thread);
	} else {
		/* Thread may be on a run queue: try to pull it for the handoff */
		pulled_thread = thread_run_queue_remove_for_handoff(thread);
	}

	return pulled_thread;
}
6053
6054 /*
6055 * thread_run_queue_remove:
6056 *
6057 * Remove a thread from its current run queue and
6058 * return TRUE if successful.
6059 *
6060 * Thread must be locked.
6061 *
6062 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6063 * run queues because the caller locked the thread. Otherwise
6064 * the thread is on a run queue, but could be chosen for dispatch
6065 * and removed by another processor under a different lock, which
6066 * will set thread->runq to PROCESSOR_NULL.
6067 *
6068 * Hence the thread select path must not rely on anything that could
6069 * be changed under the thread lock after calling this function,
6070 * most importantly thread->sched_pri.
6071 */
boolean_t
thread_run_queue_remove(
	thread_t        thread)
{
	boolean_t removed = FALSE;
	/* Snapshot: runq can be cleared concurrently under the pset lock */
	processor_t processor = thread->runq;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		assert(thread->runq == PROCESSOR_NULL);
		return FALSE;
	}

	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		/* Non-realtime: defer to the scheduler policy's own remove */
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime path: the RT runq is protected by the pset lock */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/* Re-check under the pset lock: another CPU may have dequeued it */
	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on the RT run queue and we have a lock on
		 * that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6122
6123 /*
6124 * Put the thread back where it goes after a thread_run_queue_remove
6125 *
6126 * Thread must have been removed under the same thread lock hold
6127 *
6128 * thread locked, at splsched
6129 */
6130 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6131 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6132 {
6133 assert(thread->runq == PROCESSOR_NULL);
6134 assert(thread->state & (TH_RUN));
6135
6136 thread_setrun(thread, options);
6137 }
6138
6139 void
sys_override_cpu_throttle(boolean_t enable_override)6140 sys_override_cpu_throttle(boolean_t enable_override)
6141 {
6142 if (enable_override) {
6143 cpu_throttle_enabled = 0;
6144 } else {
6145 cpu_throttle_enabled = 1;
6146 }
6147 }
6148
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task, or idle thread: no urgency to report */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* RT threads report their period/deadline as the params */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED);
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * Background urgency applied when thread priority is
		 * MAXPRI_THROTTLE or lower and thread is not promoted
		 * and thread has a QoS specified
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Out-params are optional; callers may pass NULL for either */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6210
6211 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6212 thread_get_perfcontrol_class(thread_t thread)
6213 {
6214 /* Special case handling */
6215 if (thread->state & TH_IDLE) {
6216 return PERFCONTROL_CLASS_IDLE;
6217 }
6218
6219 if (thread->sched_mode == TH_MODE_REALTIME) {
6220 return PERFCONTROL_CLASS_REALTIME;
6221 }
6222
6223 /* perfcontrol_class based on base_pri */
6224 if (thread->base_pri <= MAXPRI_THROTTLE) {
6225 return PERFCONTROL_CLASS_BACKGROUND;
6226 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6227 return PERFCONTROL_CLASS_UTILITY;
6228 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6229 return PERFCONTROL_CLASS_NONUI;
6230 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6231 return PERFCONTROL_CLASS_UI;
6232 } else {
6233 if (get_threadtask(thread) == kernel_task) {
6234 /*
6235 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6236 * All other lower priority kernel threads should be treated
6237 * as regular threads for performance control purposes.
6238 */
6239 return PERFCONTROL_CLASS_KERNEL;
6240 }
6241 return PERFCONTROL_CLASS_ABOVEUI;
6242 }
6243 }
6244
6245 /*
6246 * This is the processor idle loop, which just looks for other threads
6247 * to execute. Processor idle threads invoke this without supplying a
6248 * current thread to idle without an asserted wait state.
6249 *
 * Returns the next thread to execute if dispatched directly.
6251 */
6252
6253 #if 0
6254 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6255 #else
6256 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6257 #endif
6258
6259 #if (DEVELOPMENT || DEBUG)
6260 int sched_idle_delay_cpuid = -1;
6261 #endif
6262
thread_t
processor_idle(
	thread_t        thread,
	processor_t     processor)
{
	processor_set_t pset = processor->processor_set;

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	uint64_t ctime = mach_absolute_time();

	/* Account subsequent time to the idle timer instead of system time */
	timer_switch(&processor->system_state, ctime, &processor->idle_state);
	processor->current_state = &processor->idle_state;

	cpu_quiescent_counter_leave(ctime);

	/*
	 * Idle loop: sleep in machine_idle() until some condition below
	 * indicates there may be work, then break out to run thread_select().
	 */
	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Another CPU changed our state (e.g. dispatched us a thread) */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		/* An urgent AST was targeted at this CPU */
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		/* Realtime spill work was pushed toward this CPU */
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (processor->is_recommended && (processor->processor_primary == processor)) {
			/* Recommended primaries also wake for pending RT work */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Secondaries / derecommended CPUs only run bound threads */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Testing hook: artificially delay idle exit on one CPU */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Re-acquire visibility of cross-CPU state updates after waking */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time());
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	ctime = mach_absolute_time();

	/* Switch accounting back from idle time to system time */
	timer_switch(&processor->idle_state, ctime, &processor->system_state);
	processor->current_state = &processor->system_state;

	cpu_quiescent_counter_join(ctime);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6394
6395 /*
6396 * Each processor has a dedicated thread which
6397 * executes the idle loop when there is no suitable
6398 * previous context.
6399 *
6400 * This continuation is entered with interrupts disabled.
6401 */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/*
		 * processor_idle() handed us a thread directly: switch to it,
		 * re-entering this continuation when the idle thread next runs.
		 */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing dispatched directly: block and let the scheduler pick */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
6437
6438 kern_return_t
idle_thread_create(processor_t processor)6439 idle_thread_create(
6440 processor_t processor)
6441 {
6442 kern_return_t result;
6443 thread_t thread;
6444 spl_t s;
6445 char name[MAXTHREADNAMESIZE];
6446
6447 result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6448 if (result != KERN_SUCCESS) {
6449 return result;
6450 }
6451
6452 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6453 thread_set_thread_name(thread, name);
6454
6455 s = splsched();
6456 thread_lock(thread);
6457 thread->bound_processor = processor;
6458 processor->idle_thread = thread;
6459 thread->sched_pri = thread->base_pri = IDLEPRI;
6460 thread->state = (TH_RUN | TH_IDLE);
6461 thread->options |= TH_OPT_IDLE_THREAD;
6462 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6463 thread_unlock(thread);
6464 splx(s);
6465
6466 thread_deallocate(thread);
6467
6468 return KERN_SUCCESS;
6469 }
6470
6471 /*
6472 * sched_startup:
6473 *
6474 * Kicks off scheduler services.
6475 *
6476 * Called at splsched.
6477 */
void
sched_startup(void)
{
	kern_return_t result;
	thread_t thread;

	simple_lock_init(&sched_vm_group_list_lock, 0);

#if __arm__ || __arm64__
	simple_lock_init(&sched_recommended_cores_lock, 0);
#endif /* __arm__ || __arm64__ */

	/* Spawn the scheduler maintenance thread (entry: sched_init_thread) */
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
	if (result != KERN_SUCCESS) {
		panic("sched_startup");
	}

	thread_deallocate(thread);

	assert_thread_magic(thread);

	/*
	 * Yield to the sched_init_thread once, to
	 * initialize our own thread after being switched
	 * back to.
	 *
	 * The current thread is the only other thread
	 * active at this point.
	 */
	thread_block(THREAD_CONTINUE_NULL);
}
6510
6511 #if __arm64__
6512 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6513 #endif /* __arm64__ */
6514
6515
6516 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6517
6518 static volatile uint64_t sched_maintenance_deadline;
6519 static uint64_t sched_tick_last_abstime;
6520 static uint64_t sched_tick_delta;
6521 uint64_t sched_tick_max_delta;
6522
6523
6524 /*
6525 * sched_init_thread:
6526 *
6527 * Perform periodic bookkeeping functions about ten
6528 * times per second.
6529 */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t sched_tick_ctime, late_time;

	/* Track the earliest still-waiting runnable thread per class */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First invocation: no history, assume a single tick elapsed */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 * Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 * Scan the run queues for threads which
	 * may need to be updated, and find the earliest runnable thread on the runqueue
	 * to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/* Convert "earliest make-runnable" timestamps into waiting latencies */
	uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm__ || __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm__ || __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until woken by sched_timeshare_consider_maintenance(), then re-enter */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6635
6636 static uint64_t sched_maintenance_wakeups;
6637
6638 /*
6639 * Determine if the set of routines formerly driven by a maintenance timer
6640 * must be invoked, based on a deadline comparison. Signals the scheduler
6641 * maintenance thread on deadline expiration. Must be invoked at an interval
6642 * lower than the "sched_tick_interval", currently accomplished by
6643 * invocation via the quantum expiration timer and at context switch time.
6644 * Performance matters: this routine reuses a timestamp approximating the
6645 * current absolute time received from the caller, and should perform
6646 * no more than a comparison against the deadline in the common case.
6647 */
void
sched_timeshare_consider_maintenance(uint64_t ctime)
{
	cpu_quiescent_counter_checkin(ctime);

	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread must not try to wake itself */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/*
		 * Only the CPU that wins the CAS advances the deadline and
		 * wakes the maintenance thread, so concurrent expirations
		 * produce a single wakeup.
		 */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
		}
	}

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS to 0 claims the work; the winner recomputes and re-arms */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
6697
6698 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6699
void
sched_init_thread(void)
{
	/*
	 * Entry point of the scheduler maintenance thread (spawned by
	 * sched_startup()).  Block once first — sched_startup() yields to
	 * this thread and expects to be switched back to before the
	 * maintenance loop starts.
	 */
	thread_block(THREAD_CONTINUE_NULL);

	thread_t thread = current_thread();

	thread_set_thread_name(thread, "sched_maintenance_thread");

	/* Published so sched_timeshare_consider_maintenance() can identify us */
	sched_maintenance_thread = thread;

	/* Enter the scheduler policy's maintenance continuation; never returns */
	SCHED(maintenance_continuation)();

	/*NOTREACHED*/
}
6715
6716 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6717
6718 /*
6719 * thread_update_scan / runq_scan:
6720 *
6721 * Scan the run queues to account for timesharing threads
6722 * which need to be updated.
6723 *
6724 * Scanner runs in two passes. Pass one squirrels likely
6725 * threads away in an array, pass two does the update.
6726 *
6727 * This is necessary because the run queue is locked for
6728 * the candidate scan, but the thread is locked for the update.
6729 *
6730 * Array should be sized to make forward progress, without
6731 * disabling preemption for long periods.
6732 */
6733
6734 #define THREAD_UPDATE_SIZE 128
6735
6736 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
6737 static uint32_t thread_update_count = 0;
6738
6739 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
6740 boolean_t
thread_update_add_thread(thread_t thread)6741 thread_update_add_thread(thread_t thread)
6742 {
6743 if (thread_update_count == THREAD_UPDATE_SIZE) {
6744 return FALSE;
6745 }
6746
6747 thread_update_array[thread_update_count++] = thread;
6748 thread_reference(thread);
6749 return TRUE;
6750 }
6751
/*
 * Second pass of thread_update_scan: for each thread stashed by
 * thread_update_add_thread(), recompute its priority if still needed,
 * then drop the reference taken in pass one.
 */
void
thread_update_process_threads(void)
{
	assert(thread_update_count <= THREAD_UPDATE_SIZE);

	for (uint32_t i = 0; i < thread_update_count; i++) {
		thread_t thread = thread_update_array[i];
		assert_thread_magic(thread);
		/* Clear the slot so no stale pointer lingers in the static array */
		thread_update_array[i] = THREAD_NULL;

		spl_t s = splsched();
		thread_lock(thread);
		/* Skip threads now waiting, or already updated this sched_tick */
		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
			SCHED(update_priority)(thread);
		}
		thread_unlock(thread);
		splx(s);

		/* Release the reference taken by thread_update_add_thread() */
		thread_deallocate(thread);
	}

	thread_update_count = 0;
}
6775
6776 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)6777 runq_scan_thread(
6778 thread_t thread,
6779 sched_update_scan_context_t scan_context)
6780 {
6781 assert_thread_magic(thread);
6782
6783 if (thread->sched_stamp != sched_tick &&
6784 thread->sched_mode == TH_MODE_TIMESHARE) {
6785 if (thread_update_add_thread(thread) == FALSE) {
6786 return TRUE;
6787 }
6788 }
6789
6790 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
6791 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
6792 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
6793 }
6794 } else {
6795 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
6796 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
6797 }
6798 }
6799
6800 return FALSE;
6801 }
6802
/*
 * Scan a runq for candidate threads.
 *
 * Visits every thread on the run queue, letting runq_scan_thread()
 * collect timeshare threads needing updates and starvation timestamps.
 *
 * Returns TRUE if retry is needed (the update array filled up).
 */
boolean_t
runq_scan(
	run_queue_t runq,
	sched_update_scan_context_t scan_context)
{
	int count = runq->count;
	int queue_index;

	assert(count >= 0);

	if (count == 0) {
		return FALSE;
	}

	/* Walk only the priority levels the bitmap marks as non-empty */
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
		thread_t thread;
		circle_queue_t queue = &runq->queues[queue_index];

		cqe_foreach_element(thread, queue, runq_links) {
			assert(count > 0);
			/* TRUE => thread_update_array filled; caller retries the scan */
			if (runq_scan_thread(thread, scan_context) == TRUE) {
				return TRUE;
			}
			count--;
		}
	}

	return FALSE;
}
6839
6840 #if CONFIG_SCHED_CLUTCH
6841
/*
 * Clutch-scheduler analogue of runq_scan(): walk a timeshare thread
 * list and feed each thread to runq_scan_thread().
 *
 * Returns TRUE if the scan must be retried (thread_update_array full).
 */
boolean_t
sched_clutch_timeshare_scan(
	queue_t thread_queue,
	uint16_t thread_count,
	sched_update_scan_context_t scan_context)
{
	if (thread_count == 0) {
		return FALSE;
	}

	thread_t thread;
	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
		if (runq_scan_thread(thread, scan_context) == TRUE) {
			return TRUE;
		}
		thread_count--;
	}

	/* The queue should have held exactly thread_count threads */
	assert(thread_count == 0);
	return FALSE;
}
6863
6864
6865 #endif /* CONFIG_SCHED_CLUTCH */
6866
6867 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6868
6869 bool
thread_is_eager_preempt(thread_t thread)6870 thread_is_eager_preempt(thread_t thread)
6871 {
6872 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
6873 }
6874
/*
 * Mark a thread eagerly preemptible and try to get it off-core promptly:
 *  - if it is the current thread, run csw_check() and block right away
 *    when an AST is pending;
 *  - otherwise, poke the processor it was last seen running on.
 */
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		/* Must drop the thread lock before potentially blocking */
		thread_unlock(thread);

		if (ast != AST_NONE) {
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/* Only worth an AST check if the thread is still on that processor */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
6908
/*
 * Clear the eager-preemption flag; when called on the current thread,
 * also clear the per-processor current_is_eagerpreempt hint.
 */
void
thread_clear_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(thread_is_eager_preempt(thread));

	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		current_processor()->current_is_eagerpreempt = false;
	}

	thread_unlock(thread);
	splx(s);
}
6926
6927 /*
6928 * Scheduling statistics
6929 */
6930 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)6931 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
6932 {
6933 struct sched_statistics *stats;
6934 boolean_t to_realtime = FALSE;
6935
6936 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
6937 stats->csw_count++;
6938
6939 if (otherpri >= BASEPRI_REALTIME) {
6940 stats->rt_sched_count++;
6941 to_realtime = TRUE;
6942 }
6943
6944 if ((reasons & AST_PREEMPT) != 0) {
6945 stats->preempt_count++;
6946
6947 if (selfpri >= BASEPRI_REALTIME) {
6948 stats->preempted_rt_count++;
6949 }
6950
6951 if (to_realtime) {
6952 stats->preempted_by_rt_count++;
6953 }
6954 }
6955 }
6956
6957 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)6958 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
6959 {
6960 uint64_t timestamp = mach_absolute_time();
6961
6962 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
6963 stats->last_change_timestamp = timestamp;
6964 }
6965
/*
 * For calls from assembly code.
 *
 * The thread_wakeup() macro is #undef'd here so that a real function
 * with external linkage exists for assembly callers; it simply
 * forwards to thread_wakeup_with_result() with THREAD_AWAKENED.
 */
#undef thread_wakeup
void
thread_wakeup(
	event_t x);

void
thread_wakeup(
	event_t x)
{
	thread_wakeup_with_result(x, THREAD_AWAKENED);
}
6980
6981 boolean_t
preemption_enabled(void)6982 preemption_enabled(void)
6983 {
6984 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
6985 }
6986
/*
 * Convert the default timer-deadline tracking bin thresholds from
 * nanoseconds into mach_absolute_time units at startup.
 */
static void
sched_timer_deadline_tracking_init(void)
{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
6993
6994 #if __arm__ || __arm64__
6995
/* Most recent request from the perf controller, applied when no override is active */
uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
/* Overrides that defer application of the requested mask */
bool perfcontrol_failsafe_active = false;
bool perfcontrol_sleep_override = false;

/* Timestamps (mach_absolute_time units) bracketing failsafe activity */
uint64_t perfcontrol_failsafe_maintenance_runnable_time;
uint64_t perfcontrol_failsafe_activation_time;
uint64_t perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int perfcontrol_failsafe_pid;
uint64_t perfcontrol_failsafe_tid;
uint64_t perfcontrol_failsafe_thread_timer_at_start;
uint64_t perfcontrol_failsafe_thread_timer_last_seen;
uint32_t perfcontrol_failsafe_recommended_at_trigger;
7013
7014 /*
7015 * Perf controller calls here to update the recommended core bitmask.
7016 * If the failsafe is active, we don't immediately apply the new value.
7017 * Instead, we store the new request and use it after the failsafe deactivates.
7018 *
7019 * If the failsafe is not active, immediately apply the update.
7020 *
7021 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7022 * interrupts are enabled
7023 *
7024 * currently prototype is in osfmk/arm/machine_routines.h
7025 */
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Always record the request, even if it cannot be applied right now */
	perfcontrol_requested_recommended_cores = recommended_cores;
	perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);

	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		/* Effective mask is the intersection of perfcontrol and usercontrol requests */
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
	} else {
		/* Failsafe or sleep override holds the mask; just trace the deferred request */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}

	simple_unlock(&sched_recommended_cores_lock);
	splx(s);
}
7049
/*
 * Force all cores recommended while entering sleep; balanced by
 * sched_restore_recommended_cores_after_sleep().
 */
void
sched_override_recommended_cores_for_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Idempotent: only take the override once */
	if (perfcontrol_sleep_override == false) {
		perfcontrol_sleep_override = true;
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
	}

	simple_unlock(&sched_recommended_cores_lock);
	splx(s);
}
7064
/*
 * Drop the sleep override and reinstate the requested recommendation
 * (intersection of the perfcontrol and usercontrol masks).
 */
void
sched_restore_recommended_cores_after_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Idempotent: only restore if the override is actually in effect */
	if (perfcontrol_sleep_override == true) {
		perfcontrol_sleep_override = false;
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
	}

	simple_unlock(&sched_recommended_cores_lock);
	splx(s);
}
7079
7080 /*
7081 * Consider whether we need to activate the recommended cores failsafe
7082 *
7083 * Called from quantum timer interrupt context of a realtime thread
7084 * No scheduler locks are held, interrupts are disabled
7085 */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
		/* keep track of how long the responsible thread runs */

		simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

		/* Re-test the flag under the lock: it may have been cleared meanwhile */
		if (perfcontrol_failsafe_active == TRUE &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
			    timer_grab(&cur_thread->system_timer);
		}

		simple_unlock(&sched_recommended_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
		return;
	}

	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (m_thread->runq == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation. We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Someone else may have activated the failsafe while we were unlocked */
	if (perfcontrol_failsafe_active == TRUE) {
		simple_unlock(&sched_recommended_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);

	perfcontrol_failsafe_active = TRUE;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED);

	simple_unlock(&sched_recommended_cores_lock);
}
7196
7197 /*
7198 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7199 *
7200 * Runs in the context of the maintenance thread, no locks held
7201 */
static void
sched_recommended_cores_maintenance(void)
{
	/* Common case - no failsafe, nothing to be done here */
	if (__probable(perfcontrol_failsafe_active == FALSE)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	spl_t s = splsched();
	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (perfcontrol_failsafe_active == FALSE) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */

	int pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	/* On-core time consumed by the blamed thread while the failsafe was active */
	uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
	uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
	uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	perfcontrol_failsafe_active = FALSE;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);

	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);

out:
	simple_unlock(&sched_recommended_cores_lock);
	splx(s);

	/* printf() must happen after dropping the lock and restoring interrupts */
	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%x -> 0x%x\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
7278
7279 #endif /* __arm__ || __arm64__ */
7280
/*
 * Enable or disable a processor at user request by editing the
 * usercontrol recommended-cores mask. On ARM the effective mask is the
 * intersection of the perfcontrol and usercontrol requests, and
 * application is deferred while the failsafe or sleep override holds.
 */
kern_return_t
sched_processor_enable(processor_t processor, boolean_t enable)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);

	if (enable) {
		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
	} else {
		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
	}

#if __arm__ || __arm64__
	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
	} else {
		/* Failsafe/sleep override owns the mask for now; just trace the request */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}
#else /* __arm__ || __arm64__ */
	sched_update_recommended_cores(usercontrol_requested_recommended_cores);
#endif /* !__arm__ || __arm64__ */

	simple_unlock(&sched_recommended_cores_lock);
	splx(s);

	return KERN_SUCCESS;
}
7313
7314
7315 /*
7316 * Apply a new recommended cores mask to the processors it affects
7317 * Runs after considering failsafes and such
7318 *
7319 * Iterate over processors and update their ->is_recommended field.
7320 * If a processor is running, we let it drain out at its next
7321 * quantum expiration or blocking point. If a processor is idle, there
7322 * may be more work for it to do, so IPI it.
7323 *
7324 * interrupts disabled, sched_recommended_cores_lock is held
7325 */
static void
sched_update_recommended_cores(uint64_t recommended_cores)
{
	uint64_t needs_exit_idle_mask = 0x0;

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm__ || __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm__ || __arm64__ */
	    0, 0, 0);
#endif /* ! __arm__ || __arm64__ */

	if (__builtin_popcountll(recommended_cores) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* XOR against the current state yields the CPUs whose recommendation flips */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					/* Defer the idle-exit IPI until all pset locks are dropped */
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				if (processor->state != PROCESSOR_OFF_LINE) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if (processor->state != PROCESSOR_OFF_LINE) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				/* Actively running CPUs must be told to drain immediately */
				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type != SCHED_IPI_NONE) {
					if (processor == current_processor()) {
						ast_on(AST_PREEMPT);
					} else {
						sched_ipi_perform(processor, ipi_type);
					}
				}

				/* Reacquire for the next loop iteration (queue_shutdown dropped it) */
				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
7445
7446 void
thread_set_options(uint32_t thopt)7447 thread_set_options(uint32_t thopt)
7448 {
7449 spl_t x;
7450 thread_t t = current_thread();
7451
7452 x = splsched();
7453 thread_lock(t);
7454
7455 t->options |= thopt;
7456
7457 thread_unlock(t);
7458 splx(x);
7459 }
7460
/*
 * Record a hint describing why this thread is about to block.
 * NOTE(review): presumably consumed on the thread's next block for
 * wait-reason attribution — confirm against callers.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
7466
/*
 * Thin dispatch wrapper: forward to the active scheduler's
 * qos_max_parallelism implementation.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
7472
7473 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)7474 sched_qos_max_parallelism(__unused int qos, uint64_t options)
7475 {
7476 host_basic_info_data_t hinfo;
7477 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
7478
7479
7480 /*
7481 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
7482 * implement their own qos_max_parallelism() interfaces.
7483 */
7484 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
7485
7486 /* Query the machine layer for core information */
7487 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
7488 (host_info_t)&hinfo, &count);
7489 assert(kret == KERN_SUCCESS);
7490
7491 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
7492 return hinfo.logical_cpu;
7493 } else {
7494 return hinfo.physical_cpu;
7495 }
7496 }
7497
7498 int sched_allow_NO_SMT_threads = 1;
7499 bool
thread_no_smt(thread_t thread)7500 thread_no_smt(thread_t thread)
7501 {
7502 return sched_allow_NO_SMT_threads &&
7503 (thread->bound_processor == PROCESSOR_NULL) &&
7504 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
7505 }
7506
7507 bool
processor_active_thread_no_smt(processor_t processor)7508 processor_active_thread_no_smt(processor_t processor)
7509 {
7510 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
7511 }
7512
7513 #if __arm64__
7514
7515 /*
7516 * Set up or replace old timer with new timer
7517 *
7518 * Returns true if canceled old timer, false if it did not
7519 */
boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
{
	/*
	 * Exchange deadline for new deadline, if old deadline was nonzero,
	 * then I cancelled the callback, otherwise I didn't
	 *
	 * The deadline is polled (and CAS'd back to zero before the callback
	 * fires) by the timeshare maintenance path above.
	 */

	return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
	           relaxed) != 0;
}
7531
7532 #endif /* __arm64__ */
7533
7534 #if CONFIG_SCHED_EDGE
7535
7536 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
7537
7538 /*
7539 * sched_edge_pset_running_higher_bucket()
7540 *
7541 * Routine to calculate cumulative running counts for each scheduling
7542 * bucket. This effectively lets the load calculation calculate if a
7543 * cluster is running any threads at a QoS lower than the thread being
7544 * migrated etc.
7545 */
7546
static void
sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
{
	bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];

	/* Edge Scheduler Optimization */
	for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
		sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
		/* Each running CPU counts toward its own bucket and every bucket below it */
		for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
			running_higher[bucket]++;
		}
	}
}
7560
7561 /*
7562 * sched_update_pset_load_average()
7563 *
7564 * Updates the load average for each sched bucket for a cluster.
7565 * This routine must be called with the pset lock held.
7566 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* Stale timestamp (another updater got here first); bail */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp so the EWMA arithmetic below cannot overflow */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

#if CONFIG_SCHED_EDGE
	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		if (old_shared_load != new_shared_load) {
			/* Trace only on change to keep the trace stream quiet */
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}
#endif /* CONFIG_SCHED_EDGE */

	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Zero-crossing in either direction: adopt the instantaneous depth */
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		if (load_average != old_load_average) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
7649
7650 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)7651 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
7652 {
7653 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
7654 uint64_t avg_thread_execution_time = 0;
7655
7656 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
7657 old_execution_time_packed.pset_execution_time_packed,
7658 new_execution_time_packed.pset_execution_time_packed, relaxed, {
7659 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
7660 int64_t delta_ticks = curtime - last_update;
7661 if (delta_ticks < 0) {
7662 /*
7663 * Its possible that another CPU came in and updated the pset_execution_time
7664 * before this CPU could do it. Since the average execution time is meant to
7665 * be an approximate measure per cluster, ignore the older update.
7666 */
7667 os_atomic_rmw_loop_give_up(return );
7668 }
7669 uint64_t delta_nsecs = 0;
7670 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
7671
7672 uint64_t nanotime = 0;
7673 absolutetime_to_nanoseconds(execution_time, &nanotime);
7674 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
7675
7676 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
7677 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
7678
7679 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
7680 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
7681 new_execution_time_packed.pset_execution_time_last_update = curtime;
7682 });
7683 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
7684 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
7685 }
7686 }
7687
7688 uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset,cluster_shared_rsrc_type_t shared_rsrc_type)7689 sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
7690 {
7691 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
7692 }
7693
7694 #else /* CONFIG_SCHED_EDGE */
7695
7696 void
sched_update_pset_load_average(processor_set_t pset,__unused uint64_t curtime)7697 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
7698 {
7699 int non_rt_load = pset->pset_runq.count;
7700 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
7701 int new_load_average = ((int)pset->load_average + load) >> 1;
7702
7703 pset->load_average = new_load_average;
7704 #if (DEVELOPMENT || DEBUG)
7705 #if __AMP__
7706 if (pset->pset_cluster_type == PSET_AMP_P) {
7707 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
7708 }
7709 #endif
7710 #endif
7711 }
7712
7713 void
sched_update_pset_avg_execution_time(__unused processor_set_t pset,__unused uint64_t execution_time,__unused uint64_t curtime,__unused sched_bucket_t sched_bucket)7714 sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
7715 {
7716 }
7717
7718 #endif /* CONFIG_SCHED_EDGE */
7719
7720 /* pset is locked */
7721 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)7722 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
7723 {
7724 int cpuid = processor->cpu_id;
7725 #if defined(__x86_64__)
7726 if (sched_avoid_cpu0 && (cpuid == 0)) {
7727 return false;
7728 }
7729 #endif
7730
7731 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
7732
7733 return bit_test(fasttrack_map, cpuid);
7734 }
7735
7736 /* pset is locked */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
	/*
	 * Pick a CPU in this pset for a realtime thread, preferring primaries over
	 * SMT secondaries, and (on x86) steering away from cpu0/cpu1 per the
	 * sched_avoid_cpu0 policy. Returns PROCESSOR_NULL if no candidate exists
	 * (unless a fallback "enqueue here anyway" CPU applies, see below).
	 */
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/*
	 * Candidates: available CPUs with no urgent AST pending that are not
	 * already running a realtime thread.
	 */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	/* sched_avoid_cpu0 == 2: exclude cpu0 entirely (until the retry fallback below). */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}

	/* First preference: primary (non-SMT-secondary) CPUs. */
	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate so cpu0 is searched last rather than first. */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id. */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * Nothing found while excluding cpu0/cpu1: relax the exclusion and retry
	 * once. (Ideally the compiler would optimize this away when avoid_cpu0 is
	 * the const bool false, but it still complains about the assignment in
	 * that case, hence the #if.)
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback, but now counting SMT secondaries as well. */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
7864
7865 /*
7866 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
7867 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
7868 *
7869 * pset is locked.
7870 */
static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
{
	/* Only preempt a CPU whose deadline is beyond minimum_deadline by more than the epsilon. */
	uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
	processor_t fd_processor = PROCESSOR_NULL;
	int lowest_priority = max_pri;

	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	/* Track the (lowest priority, then furthest deadline) CPU seen so far. */
	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_pri > lowest_priority) {
			continue;
		}

		if (processor->current_pri < lowest_priority) {
			/* A strictly lower priority wins regardless of deadline. */
			lowest_priority = processor->current_pri;
			furthest_deadline = processor->deadline;
			fd_processor = processor;
			continue;
		}

		/* Equal priority: prefer the further deadline. */
		if (processor->deadline > furthest_deadline) {
			furthest_deadline = processor->deadline;
			fd_processor = processor;
		}
	}

	if (fd_processor) {
		return fd_processor;
	}

	/*
	 * There is a race condition possible when there are multiple processor sets.
	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
	 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
	 *
	 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
	 * on the run queue of that pset.
	 */
	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
		assert(skip_processor == PROCESSOR_NULL);
		assert(skip_spills == false);

		/* Second pass over the pending-urgent CPUs, same selection rule as above. */
		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
			processor_t processor = processor_array[cpuid];

			if (processor->current_pri > lowest_priority) {
				continue;
			}

			if (processor->current_pri < lowest_priority) {
				lowest_priority = processor->current_pri;
				furthest_deadline = processor->deadline;
				fd_processor = processor;
				continue;
			}

			if (processor->deadline > furthest_deadline) {
				furthest_deadline = processor->deadline;
				fd_processor = processor;
			}
		}
	}

	return fd_processor;
}
7952
7953 /* pset is locked */
7954 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)7955 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
7956 {
7957 bool skip_spills = true;
7958 bool include_ast_urgent_pending_cpus = false;
7959
7960 processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
7961 if (next_processor != PROCESSOR_NULL) {
7962 return next_processor;
7963 }
7964
7965 next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
7966 return next_processor;
7967 }
7968
7969 #if defined(__x86_64__)
7970 /* pset is locked */
7971 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)7972 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
7973 {
7974 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
7975 int nbackup_cpus = 0;
7976
7977 if (include_backups && rt_runq_is_low_latency(pset)) {
7978 nbackup_cpus = sched_rt_n_backup_processors;
7979 }
7980
7981 cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
7982 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7983 bit_clear(cpu_map, 0);
7984 }
7985 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
7986 }
7987
7988 /* pset is locked */
7989 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)7990 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
7991 {
7992 int nbackup_cpus = 0;
7993
7994 if (include_backups && rt_runq_is_low_latency(pset)) {
7995 nbackup_cpus = sched_rt_n_backup_processors;
7996 }
7997
7998 cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
7999 return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8000 }
8001 #endif
8002
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	/*
	 * Policy check: may this processor run a realtime thread right now?
	 * On x86, cpu0 (and its SMT sibling cpu1, under the stronger policy) and
	 * SMT secondaries are only allowed to run RT when enough of the other
	 * CPUs are already busy with realtime work. On other architectures every
	 * recommended processor qualifies.
	 */
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	/* An RT spill already targeted at this CPU always allows it to run RT. */
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* cpu0 is a last resort: all other primaries must already be running RT. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* Stronger policy: everything except cpu0/cpu1 must be running RT. */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/* cpu1 (cpu0's SMT sibling): all CPUs except cpu1 itself must be running RT. */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* SMT secondary: only if RT-on-SMT is allowed and all primaries are busy with RT. */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
8033
8034 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)8035 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
8036 {
8037 if (drop_lock) {
8038 pset_unlock(pset);
8039 }
8040 }
8041
8042 void
thread_set_no_smt(bool set)8043 thread_set_no_smt(bool set)
8044 {
8045 if (!system_is_SMT) {
8046 /* Not a machine that supports SMT */
8047 return;
8048 }
8049
8050 thread_t thread = current_thread();
8051
8052 spl_t s = splsched();
8053 thread_lock(thread);
8054 if (set) {
8055 thread->sched_flags |= TH_SFLAG_NO_SMT;
8056 }
8057 thread_unlock(thread);
8058 splx(s);
8059 }
8060
8061 bool
thread_get_no_smt(void)8062 thread_get_no_smt(void)
8063 {
8064 return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8065 }
8066
8067 extern void task_set_no_smt(task_t);
8068 void
task_set_no_smt(task_t task)8069 task_set_no_smt(task_t task)
8070 {
8071 if (!system_is_SMT) {
8072 /* Not a machine that supports SMT */
8073 return;
8074 }
8075
8076 if (task == TASK_NULL) {
8077 task = current_task();
8078 }
8079
8080 task_lock(task);
8081 task->t_flags |= TF_NO_SMT;
8082 task_unlock(task);
8083 }
8084
8085 #if DEBUG || DEVELOPMENT
8086 extern void sysctl_task_set_no_smt(char no_smt);
8087 void
sysctl_task_set_no_smt(char no_smt)8088 sysctl_task_set_no_smt(char no_smt)
8089 {
8090 if (!system_is_SMT) {
8091 /* Not a machine that supports SMT */
8092 return;
8093 }
8094
8095 task_t task = current_task();
8096
8097 task_lock(task);
8098 if (no_smt == '1') {
8099 task->t_flags |= TF_NO_SMT;
8100 }
8101 task_unlock(task);
8102 }
8103
8104 extern char sysctl_task_get_no_smt(void);
8105 char
sysctl_task_get_no_smt(void)8106 sysctl_task_get_no_smt(void)
8107 {
8108 task_t task = current_task();
8109
8110 if (task->t_flags & TF_NO_SMT) {
8111 return '1';
8112 }
8113 return '0';
8114 }
8115 #endif /* DEVELOPMENT || DEBUG */
8116
8117
8118 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8119 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8120 {
8121 #if __AMP__
8122 spl_t s = splsched();
8123 thread_lock(thread);
8124 thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8125 thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8126 if (soft_bound) {
8127 thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8128 }
8129 switch (cluster_type) {
8130 case 'e':
8131 case 'E':
8132 if (pset0.pset_cluster_type == PSET_AMP_E) {
8133 thread->th_bound_cluster_id = pset0.pset_id;
8134 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8135 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8136 }
8137 break;
8138 case 'p':
8139 case 'P':
8140 if (pset0.pset_cluster_type == PSET_AMP_P) {
8141 thread->th_bound_cluster_id = pset0.pset_id;
8142 } else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8143 thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8144 }
8145 break;
8146 default:
8147 break;
8148 }
8149 thread_unlock(thread);
8150 splx(s);
8151
8152 if (thread == current_thread()) {
8153 thread_block(THREAD_CONTINUE_NULL);
8154 }
8155 #else /* __AMP__ */
8156 (void)thread;
8157 (void)cluster_type;
8158 (void)soft_bound;
8159 #endif /* __AMP__ */
8160 }
8161
8162 extern uint32_t thread_bound_cluster_id(thread_t thread);
8163 uint32_t
thread_bound_cluster_id(thread_t thread)8164 thread_bound_cluster_id(thread_t thread)
8165 {
8166 return thread->th_bound_cluster_id;
8167 }
8168
__private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
{
	/*
	 * Bind or unbind a thread to/from a specific cluster id.
	 * Options: THREAD_BIND_SOFT (soft binding), THREAD_BIND_ELIGIBLE_ONLY
	 * (fail with KERN_INVALID_POLICY if the scheduler deems the thread
	 * ineligible for the pset), THREAD_UNBIND (clear any binding).
	 * On non-AMP builds this is a no-op that returns KERN_SUCCESS.
	 */
#if __AMP__

	processor_set_t pset = NULL;
	if (options & (THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY)) {
		/* Validate the inputs for the bind case */
		int max_clusters = ml_get_cluster_count();
		if (cluster_id >= max_clusters) {
			/* Invalid cluster id */
			return KERN_INVALID_ARGUMENT;
		}
		pset = pset_array[cluster_id];
		if (pset == NULL) {
			/* Cluster has not been initialized yet */
			return KERN_INVALID_ARGUMENT;
		}
		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
				/* Thread is not recommended for the cluster type */
				return KERN_INVALID_POLICY;
			}
		}
	}

	if (options & THREAD_UNBIND) {
		/* If the thread was actually not bound to some cluster, nothing to do here */
		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
			return KERN_SUCCESS;
		}
	}

	spl_t s = splsched();
	thread_lock(thread);

	/* Unbind the thread from its previous bound state */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;

	if (options & THREAD_UNBIND) {
		/* Nothing more to do here */
		goto thread_bind_cluster_complete;
	}

	if (options & THREAD_BIND_SOFT) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	thread->th_bound_cluster_id = cluster_id;

thread_bind_cluster_complete:
	thread_unlock(thread);
	splx(s);

	/* If the caller rebound itself, block so the new binding takes effect now. */
	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_id;
	(void)options;
#endif /* __AMP__ */
	return KERN_SUCCESS;
}
8233
8234 #if DEVELOPMENT || DEBUG
8235 extern int32_t sysctl_get_bound_cpuid(void);
8236 int32_t
sysctl_get_bound_cpuid(void)8237 sysctl_get_bound_cpuid(void)
8238 {
8239 int32_t cpuid = -1;
8240 thread_t self = current_thread();
8241
8242 processor_t processor = self->bound_processor;
8243 if (processor == NULL) {
8244 cpuid = -1;
8245 } else {
8246 cpuid = processor->cpu_id;
8247 }
8248
8249 return cpuid;
8250 }
8251
8252 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
8253 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)8254 sysctl_thread_bind_cpuid(int32_t cpuid)
8255 {
8256 processor_t processor = PROCESSOR_NULL;
8257
8258 if (cpuid == -1) {
8259 goto unbind;
8260 }
8261
8262 if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
8263 return KERN_INVALID_VALUE;
8264 }
8265
8266 processor = processor_array[cpuid];
8267 if (processor == PROCESSOR_NULL) {
8268 return KERN_INVALID_VALUE;
8269 }
8270
8271 #if __AMP__
8272
8273 thread_t thread = current_thread();
8274
8275 if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
8276 if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
8277 /* Cannot hard-bind an already hard-cluster-bound thread */
8278 return KERN_NOT_SUPPORTED;
8279 }
8280 }
8281
8282 #endif /* __AMP__ */
8283
8284 unbind:
8285 thread_bind(processor);
8286
8287 thread_block(THREAD_CONTINUE_NULL);
8288 return KERN_SUCCESS;
8289 }
8290
8291 extern char sysctl_get_task_cluster_type(void);
8292 char
sysctl_get_task_cluster_type(void)8293 sysctl_get_task_cluster_type(void)
8294 {
8295 task_t task = current_task();
8296 processor_set_t pset_hint = task->pset_hint;
8297
8298 if (!pset_hint) {
8299 return '0';
8300 }
8301
8302 #if __AMP__
8303 if (pset_hint->pset_cluster_type == PSET_AMP_E) {
8304 return 'E';
8305 } else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
8306 return 'P';
8307 }
8308 #endif
8309
8310 return '0';
8311 }
8312
8313 #if __AMP__
8314 static processor_set_t
find_pset_of_type(pset_cluster_type_t t)8315 find_pset_of_type(pset_cluster_type_t t)
8316 {
8317 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
8318 if (node->pset_cluster_type != t) {
8319 continue;
8320 }
8321
8322 processor_set_t pset = PROCESSOR_SET_NULL;
8323 for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8324 pset = pset_array[pset_id];
8325 /* Prefer one with recommended processsors */
8326 if (pset->recommended_bitmask != 0) {
8327 assert(pset->pset_cluster_type == t);
8328 return pset;
8329 }
8330 }
8331 /* Otherwise return whatever was found last */
8332 return pset;
8333 }
8334
8335 return PROCESSOR_SET_NULL;
8336 }
8337 #endif
8338
8339 extern void sysctl_task_set_cluster_type(char cluster_type);
8340 void
sysctl_task_set_cluster_type(char cluster_type)8341 sysctl_task_set_cluster_type(char cluster_type)
8342 {
8343 task_t task = current_task();
8344 processor_set_t pset_hint = PROCESSOR_SET_NULL;
8345
8346 #if __AMP__
8347 switch (cluster_type) {
8348 case 'e':
8349 case 'E':
8350 pset_hint = find_pset_of_type(PSET_AMP_E);
8351 break;
8352 case 'p':
8353 case 'P':
8354 pset_hint = find_pset_of_type(PSET_AMP_P);
8355 break;
8356 default:
8357 break;
8358 }
8359
8360 if (pset_hint) {
8361 task_lock(task);
8362 task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
8363 task->pset_hint = pset_hint;
8364 task_unlock(task);
8365
8366 thread_block(THREAD_CONTINUE_NULL);
8367 }
8368 #else
8369 (void)cluster_type;
8370 (void)task;
8371 (void)pset_hint;
8372 #endif
8373 }
8374
8375 #endif /* DEVELOPMENT || DEBUG */
8376