1 /*
2 * Copyright (c) 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <stdint.h>
30
31 #include <kern/bits.h>
32 #include <kern/kern_types.h>
33 #include <kern/processor.h>
34 #include <kern/sched_clutch.h>
35 #include <kern/sched_common.h>
36 #include <kern/sched_prim.h>
37 #include <kern/sched_rt.h>
38 #include <kern/thread.h>
39 #include <kern/queue.h>
40
41 #include <sys/kdebug_kernel.h>
42
43 #include <os/atomic_private.h>
44
45 #include <machine/machine_routines.h>
46
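/*
 * Prefer the macOS release-build kdebug tracepoint macro when it is available,
 * falling back to the generic release-build macro otherwise.
 */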
47 #ifdef KDBG_MACOS_RELEASE
48 #define KTRC KDBG_MACOS_RELEASE
49 #else
50 #define KTRC KDBG_RELEASE
51 #endif
52
53 #pragma mark - Constants and Tunables
54
55 #if (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS)
56 #include <kern/startup.h>
57
58 /*
59 * Tunables controlling how xnu initializes the realtime matrix. CLPC can
60 * override their effects with sched_perfcontrol interfaces.
61 */
62
63 TUNABLE(unsigned int, sched_rt_spill_policy, "sched_rt_spill_policy", 1);
64
65 TUNABLE(unsigned, sched_rt_steal_policy, "sched_rt_steal_policy", 2);
66 #endif /* (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) */
67
68 uint32_t rt_deadline_epsilon;
69 uint32_t rt_constraint_threshold;
70 /* epsilon for comparing RT deadlines */
71 int rt_deadline_epsilon_us = 100;
72 uint32_t max_rt_quantum;
73 uint32_t min_rt_quantum;
74 int sched_allow_rt_smt = 1;
75 int sched_rt_runq_strict_priority = false;
76
77 int
78 sched_get_rt_deadline_epsilon(void)
79 {
80 return rt_deadline_epsilon_us;
81 }
82
83 void
84 sched_set_rt_deadline_epsilon(int new_epsilon_us)
85 {
86 rt_deadline_epsilon_us = new_epsilon_us;
87
88 uint64_t abstime;
89 clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
90 assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
91 rt_deadline_epsilon = (uint32_t)abstime;
92 }
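
/*
 * Illustrative sketch (not a helper in this file): the epsilon gives deadline
 * comparisons a small dead band. A hypothetical comparison could look like
 *
 *     static bool
 *     rt_deadline_is_strictly_earlier(uint64_t a, uint64_t b)
 *     {
 *         return rt_deadline_add(a, rt_deadline_epsilon) < b;
 *     }
 *
 * so that, with the default rt_deadline_epsilon_us of 100, two deadlines less
 * than 100us apart are treated as equivalent.
 */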
93
94 #pragma mark - Initialization
95
96 static int sched_rt_max_clusters = 0;
97
98 void
99 sched_realtime_timebase_init(void)
100 {
101 uint64_t abstime;
102
103 /* smallest rt computation (50 us) */
104 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
105 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
106 min_rt_quantum = (uint32_t)abstime;
107
108 /* maximum rt computation (50 ms) */
109 clock_interval_to_absolutetime_interval(
110 50, 1000 * NSEC_PER_USEC, &abstime);
111 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
112 max_rt_quantum = (uint32_t)abstime;
113
114 /* constraint threshold for sending backup IPIs (4 ms) */
115 clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
116 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
117 rt_constraint_threshold = (uint32_t)abstime;
118
119 /* epsilon for comparing deadlines */
120 sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
121 }
122
123 #if CONFIG_SCHED_EDGE
124 /* forward-declare config utility */
125 static void
126 sched_rt_config_pset_push(processor_set_t pset);
127 #endif /* CONFIG_SCHED_EDGE */
128
129 static void
130 rt_init_completed(void)
131 {
132 /* This should be unified with sched_edge_max_clusters and moved to a common location. <rdar://145162647> */
133 sched_rt_max_clusters = ml_get_cluster_count();
134
135 /* Realtime spill/steal are only supported on platforms with the edge scheduler. */
136 #if CONFIG_SCHED_EDGE
137 /* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
138 spl_t s = splsched();
139 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
140 for (int src_cluster_id = 0; src_cluster_id < sched_rt_max_clusters; src_cluster_id++) {
141 processor_set_t src_pset = pset_array[src_cluster_id];
142 assert3p(src_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
143
144 /* For each cluster, set all its outgoing edge parameters */
145 for (int dst_cluster_id = 0; dst_cluster_id < sched_rt_max_clusters; dst_cluster_id++) {
146 if (dst_cluster_id == src_cluster_id) {
147 continue;
148 }
149 processor_set_t dst_pset = pset_array[dst_cluster_id];
150 assert3p(dst_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
151
152 bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
153 if (clusters_homogenous) {
154 /* Default realtime policy: spill allowed among homogeneous psets. */
155 sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
156 .sce_migration_allowed = true,
157 .sce_steal_allowed = true,
158 .sce_migration_weight = 0,
159 });
160 } else {
161 /* Default realtime policy: disallow spill among heterogeneous psets. */
162 sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
163 .sce_migration_allowed = false,
164 .sce_steal_allowed = false,
165 .sce_migration_weight = 0,
166 });
167 }
168 }
169 }
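
/*
 * Net effect (hypothetical 2-cluster AMP example with one P and one E cluster):
 * the P<->E edges come up with sce_migration_allowed and sce_steal_allowed
 * cleared, so realtime threads neither spill to nor are stolen by the other
 * cluster type until CLPC reconfigures the matrix, e.g. via sched_rt_matrix_set().
 */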
170
171
172 for (pset_id_t pset_id = 0; pset_id < sched_rt_max_clusters; pset_id++) {
173 sched_rt_config_pset_push(pset_array[pset_id]);
174 }
175
176 simple_unlock(&sched_available_cores_lock);
177 splx(s);
178 #endif /* CONFIG_SCHED_EDGE */
179 }
180
181 static void
182 pset_rt_init(processor_set_t pset)
183 {
184 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
185 int i = pri - BASEPRI_RTQUEUES;
186 rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
187 queue_init(&rqi->pri_queue);
188 rqi->pri_count = 0;
189 rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
190 rqi->pri_constraint = RT_CONSTRAINT_NONE;
191 }
192 os_atomic_init(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE);
193
194 rt_queue_t rt_runq = &pset->rt_runq;
195 os_atomic_init(&rt_runq->count, 0);
196 os_atomic_init(&rt_runq->earliest_deadline, RT_DEADLINE_NONE);
197 os_atomic_init(&rt_runq->constraint, RT_CONSTRAINT_NONE);
198 os_atomic_init(&rt_runq->ed_index, NOPRI);
199 bzero(&rt_runq->bitmap, sizeof(rt_runq->bitmap));
200 bzero(&rt_runq->runq_stats, sizeof(rt_runq->runq_stats));
201
202 #if __AMP__
203 /*
204 * Initialize spill/steal search orders as invalid to prevent spill/steal
205 * before the matrix is configured.
206 */
207 bzero(pset->sched_rt_edges, sizeof(pset->sched_rt_edges));
208 for (pset_id_t i = 0; i < MAX_PSETS - 1; i++) {
209 pset->sched_rt_spill_search_order.spso_search_order[i] = PSET_ID_INVALID;
210 #if CONFIG_SCHED_EDGE
211 pset->sched_rt_steal_search_order.spso_search_order[i] = PSET_ID_INVALID;
212 #endif /* CONFIG_SCHED_EDGE */
213 }
214 #endif /* __AMP__ */
215 }
216
217 #pragma mark - Realtime Scheduler/CLPC interface
218
219 #if CONFIG_SCHED_EDGE
220 void
221 sched_rt_config_set(
222 uint8_t src_pset,
223 uint8_t dst_pset,
224 sched_clutch_edge edge_config)
225 {
226 assert(src_pset != dst_pset || !edge_config.sce_migration_allowed); /* No self-edges. */
227 os_atomic_store(&pset_array[src_pset]->sched_rt_edges[dst_pset], edge_config, relaxed);
228 }
229
230 sched_clutch_edge
231 sched_rt_config_get(
232 uint8_t src_pset,
233 uint8_t dst_pset)
234 {
235 return os_atomic_load(&pset_array[src_pset]->sched_rt_edges[dst_pset], relaxed);
236 }
237
238 void
239 sched_rt_matrix_get(
240 sched_clutch_edge *edge_matrix,
241 bool *edge_requests,
242 uint64_t num_psets)
243 {
244 uint64_t edge_index = 0;
245 for (uint8_t src_pset = 0; src_pset < num_psets; src_pset++) {
246 for (uint8_t dst_pset = 0; dst_pset < num_psets; dst_pset++) {
247 if (edge_requests[edge_index]) {
248 edge_matrix[edge_index] = sched_rt_config_get(src_pset, dst_pset);
249 }
250 edge_index++;
251 }
252 }
253 }
254
255 /*
256 * sched_rt_config_pset_push()
257 *
258 * After using sched_rt_config_set() to update edge tunables involving a particular pset, call
259 * this function to propagate the updates to the pset's derived metadata: its spill search order
260 * (built from outgoing edges allowing migration) and its steal search order (built from incoming edges allowing steal).
261 */
262 static void
263 sched_rt_config_pset_push(processor_set_t pset)
264 {
265 assert3u(pset->pset_id, <, UINT8_MAX);
266
267 sched_pset_search_order_sort_data_t spill_datas[MAX_PSETS - 1], steal_datas[MAX_PSETS - 1];
268 uint num_spill_datas = 0, num_steal_datas = 0;
269 for (pset_id_t other_pset_id = 0; other_pset_id < sched_rt_max_clusters; other_pset_id++) {
270 if (pset->pset_id == other_pset_id) {
271 continue; /* No self-edges. */
272 }
273 /* Spill */
274 sched_clutch_edge out_edge = sched_rt_config_get((pset_id_t)pset->pset_cluster_id, other_pset_id);
275 if (out_edge.sce_migration_allowed) {
276 spill_datas[num_spill_datas++] = (sched_pset_search_order_sort_data_t) {
277 .spsosd_src_pset = pset,
278 .spsosd_migration_weight = out_edge.sce_migration_weight,
279 .spsosd_dst_pset_id = other_pset_id
280 };
281 }
282 /* Steal */
283 sched_clutch_edge in_edge = sched_rt_config_get(other_pset_id, (pset_id_t)pset->pset_cluster_id);
284 if (in_edge.sce_steal_allowed) {
285 steal_datas[num_steal_datas++] = (sched_pset_search_order_sort_data_t) {
286 .spsosd_src_pset = pset,
287 .spsosd_migration_weight = in_edge.sce_migration_weight,
288 .spsosd_dst_pset_id = other_pset_id,
289 };
290 }
291 }
292 sched_pset_search_order_compute(&pset->sched_rt_spill_search_order, spill_datas, num_spill_datas, sched_edge_search_order_weight_then_locality_cmp);
293 sched_pset_search_order_compute(&pset->sched_rt_steal_search_order, steal_datas, num_steal_datas, sched_edge_search_order_weight_then_locality_cmp);
294 }
295
296 void
297 sched_rt_matrix_set(
298 sched_clutch_edge *rt_matrix,
299 bool *edge_changes,
300 uint64_t num_psets)
301 {
302 /* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
303 spl_t s = splsched();
304 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
305
306 for (uint8_t src_pset_id = 0; src_pset_id < num_psets; src_pset_id++) {
307 for (uint8_t dst_pset_id = 0; dst_pset_id < num_psets; dst_pset_id++) {
308 const uint64_t rt_matrix_index = src_pset_id * num_psets + dst_pset_id;
309 if (edge_changes[rt_matrix_index]) {
310 sched_rt_config_set(src_pset_id, dst_pset_id, rt_matrix[rt_matrix_index]);
311 }
312 }
313 }
314
315 for (pset_id_t pset_id = 0; pset_id < num_psets; pset_id++) {
316 sched_rt_config_pset_push(pset_array[pset_id]);
317 }
318
319 simple_unlock(&sched_available_cores_lock);
320 splx(s);
321 }
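
/*
 * Illustrative (hypothetical) caller: to disable realtime spill from pset 0 to
 * pset 1 only, a perfcontrol-side client could mark just that entry as changed
 * and push the matrix:
 *
 *     sched_clutch_edge matrix[MAX_PSETS * MAX_PSETS] = {0};
 *     bool changes[MAX_PSETS * MAX_PSETS] = {false};
 *     matrix[0 * num_psets + 1] = (sched_clutch_edge){ .sce_migration_allowed = false };
 *     changes[0 * num_psets + 1] = true;
 *     sched_rt_matrix_set(matrix, changes, num_psets);
 *
 * (num_psets is assumed to come from the platform topology.) The call then
 * recomputes each pset's spill/steal search order under sched_available_cores_lock.
 */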
322 #endif /* CONFIG_SCHED_EDGE */
323
324 #pragma mark - Scheduler Callouts
325
326 #if CONFIG_SCHED_SMT
327 /*
328 * SMT-aware variant of the rt_choose_processor callout. Called with starting_pset locked; returns a processor with its pset locked, or PROCESSOR_NULL with no pset lock held.
329 */
330 processor_t
331 sched_rtlocal_choose_processor_smt(
332 processor_set_t starting_pset,
333 processor_t processor,
334 thread_t thread)
335 {
336 processor_set_t nset = PROCESSOR_SET_NULL;
337 processor_set_t pset = starting_pset;
338 pset_node_t node = pset->node;
339
340 processor_t lc_processor = processor;
341 integer_t lowest_count = INT_MAX;
342 if (lc_processor != PROCESSOR_NULL) {
343 lowest_count = SCHED(processor_runq_count)(processor);
344 }
345
346 bool include_ast_urgent_pending_cpus = false;
347 cpumap_t ast_urgent_pending;
348 try_again:
349 ast_urgent_pending = 0;
350 int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
351 for (; consider_secondaries < 2; consider_secondaries++) {
352 pset = change_locked_pset(pset, starting_pset);
353 do {
354 cpumap_t available_map = pset_available_cpumap(pset);
355 if (available_map == 0) {
356 goto no_available_cpus;
357 }
358
359 processor = pset_choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false);
360 if (processor) {
361 return processor;
362 }
363
364 if (consider_secondaries) {
365 processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
366 if (processor) {
367 /*
368 * Instead of looping through all the psets to find the global
369 * furthest deadline processor, preempt the first candidate found.
370 * The preempted thread will then find any other available far deadline
371 * processors to preempt.
372 */
373 return processor;
374 }
375
376 ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
377
378 if (rt_runq_count(pset) < lowest_count) {
379 int cpuid = bit_first(available_map);
380 assert(cpuid >= 0);
381 lc_processor = processor_array[cpuid];
382 lowest_count = rt_runq_count(pset);
383 }
384 }
385
386 no_available_cpus:
387 nset = next_pset(pset);
388
389 if (nset != starting_pset) {
390 pset = change_locked_pset(pset, nset);
391 }
392 } while (nset != starting_pset);
393 }
394
395 /* Short cut for single pset nodes */
396 if (bit_count(node->pset_map) == 1) {
397 if (lc_processor) {
398 pset_assert_locked(lc_processor->processor_set);
399 return lc_processor;
400 }
401 } else {
402 if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
403 /* See the comment in pset_choose_furthest_deadline_processor_for_realtime_thread() */
404 include_ast_urgent_pending_cpus = true;
405 goto try_again;
406 }
407 }
408
409 processor = lc_processor;
410
411 if (processor) {
412 pset = change_locked_pset(pset, processor->processor_set);
413 /* Check that chosen processor is still usable */
414 cpumap_t available_map = pset_available_cpumap(pset);
415 if (bit_test(available_map, processor->cpu_id)) {
416 return processor;
417 }
418
419 /* processor is no longer usable */
420 processor = PROCESSOR_NULL;
421 }
422
423 pset_assert_locked(pset);
424 pset_unlock(pset);
425 return PROCESSOR_NULL;
426 }
427 #else /* !CONFIG_SCHED_SMT */
428 /*
429 * Called with thread and starting_pset locked. Returns either a processor with its pset
430 * locked, or PROCESSOR_NULL with no pset lock held if the pset has no available processors.
431 */
432 processor_t
433 sched_rt_choose_processor(
434 const processor_set_t starting_pset,
435 processor_t processor,
436 thread_t thread)
437 {
438 assert3u(thread->sched_pri, >=, BASEPRI_RTQUEUES);
439 assert3u(thread->sched_pri, <=, MAXPRI);
440
441 /*
442 * In choose_starting_pset, we found a good candidate pset for this thread.
443 * Now, we pick the best processor to preempt, if there is one. It is also
444 * possible that conditions have changed and the thread should spill to
445 * another pset.
446 */
447
448 processor_set_t pset = starting_pset; /* Lock is held on this pset. */
449 pset_assert_locked(pset);
450
451 #if __AMP__
452 /*
453 * If there are processors with outstanding urgent preemptions, we consider
454 * them in a second pass. While we are changing pset locks here, it is
455 * possible a processor may resolve its outstanding urgent preemption and
456 * become eligible to run this thread. See comment in
457 * pset_choose_furthest_deadline_processor_for_realtime_thread().
458 */
459 bool found_ast_urgent_pending = false; /* Tracks whether any (eligible) processors have pending urgent ASTs. */
460 for (int include_ast_urgent_pending_cpus = 0; include_ast_urgent_pending_cpus < 2; include_ast_urgent_pending_cpus++) {
461 if (include_ast_urgent_pending_cpus && !found_ast_urgent_pending) {
462 break; /* Skip the second pass. */
463 }
464
465 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
466 while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, ~0, &istate)) {
467 /* Switch to the next pset. We need to check for null psets because
468 * we do not use acquire/release semantics for the spill order. */
469 processor_set_t nset = pset_array[istate.spis_pset_id];
470 if (__improbable(nset == PROCESSOR_SET_NULL)) {
471 continue;
472 }
473 pset = change_locked_pset(pset, nset);
474
475 processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
476 if (processor != PROCESSOR_NULL) {
477 /* We found a candidate processor on this pset to wake or preempt. */
478 pset_assert_locked(processor->processor_set);
479 return processor;
480 }
481
482 /* TODO <rdar://140219824>: Policy question of EDF vs targeting idle cores on another pset. */
483 processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
484 if (processor) {
485 /*
486 * Instead of looping through all the psets to find the global
487 * furthest deadline processor, preempt the first candidate found.
488 * The preempted thread will then find any other available far deadline
489 * processors to preempt.
490 */
491 pset_assert_locked(processor->processor_set);
492 return processor;
493 }
494
495 found_ast_urgent_pending = found_ast_urgent_pending || (pset->pending_AST_URGENT_cpu_mask != 0);
496 }
497 }
498
499 /*
500 * There was no obvious (idle or non-realtime) processor to run the thread.
501 * Instead, do EDF scheduling again on starting_pset, putting the thread on
502 * the run queue if there is no processor to preempt.
503 */
504
505 pset = change_locked_pset(pset, starting_pset);
506 #endif /* __AMP__ */
507
508 /* Check (again, for AMP systems) that there is no lower-priority or idle processor. */
509 processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
510 if (processor != PROCESSOR_NULL) {
511 /* We found a candidate processor on this pset to wake or preempt. */
512 pset_assert_locked(processor->processor_set);
513 return processor;
514 }
515
516 processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, true);
517 if (processor == PROCESSOR_NULL) {
518 /* Choose an arbitrary available and recommended processor from the pset.
519 * It won't get preempted anyways, since this thread has a later
520 * deadline. */
521 int processor_id = lsb_first(pset_available_cpumap(pset));
522
523 /* starting_pset had available, recommended processors coming into
524 * rt_choose_processor(), but that might have changed after dropping the
525 * pset lock. If there are no such processors, bail out here and let
526 * sched_edge_migrate_candidate() find a better starting pset. */
527 if (processor_id < 0) {
528 pset_unlock(pset);
529 return PROCESSOR_NULL;
530 }
531
532 processor = processor_array[processor_id];
533 }
534
535 pset_assert_locked(processor->processor_set);
536 return processor;
537 }
538 #endif /* !CONFIG_SCHED_SMT */
539
540 #if CONFIG_SCHED_EDGE
541 /*
542 * Called with stealing_pset locked and returns with stealing_pset locked. The lock
543 * will have been temporarily dropped and retaken if a thread is returned, and may
544 * have been dropped even when no thread is returned.
545 */
546 thread_t
547 sched_rt_steal_thread(processor_set_t stealing_pset)
548 {
549 uint64_t earliest_deadline = rt_runq_earliest_deadline(stealing_pset);
550 processor_set_t pset = stealing_pset;
551
552 /* Continue searching until there are no steal candidates found in a single iteration. */
553 while (true) {
554 processor_set_t target_pset = NULL;
555 uint64_t target_deadline;
556 if (__improbable(os_sub_overflow(earliest_deadline, rt_deadline_epsilon, &target_deadline))) {
557 target_deadline = 0;
558 }
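/*
 * Illustrative numbers: if the local earliest deadline is 1,000,100 absolute
 * time units and rt_deadline_epsilon is 100, only psets advertising a stealable
 * earliest deadline strictly earlier than 1,000,000 are worth taking another
 * pset lock for.
 */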
559
560 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
561 while (sched_iterate_psets_ordered(stealing_pset, &stealing_pset->sched_rt_steal_search_order, ~BIT(stealing_pset->pset_id), &istate)) {
562 assert3s(istate.spis_pset_id, !=, stealing_pset->pset_id); /* stealing_pset's runqueue is drained by sched_rt_choose_processor */
563 const processor_set_t nset = pset_array[istate.spis_pset_id];
564 /* Check for null because we do not use acquire/release semantics for steal order. */
565 if (__improbable(nset == PROCESSOR_SET_NULL)) {
566 continue;
567 }
568 uint64_t nset_deadline = os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed);
569 if (nset_deadline < target_deadline) {
570 target_pset = nset;
571 target_deadline = nset_deadline;
572 }
573 }
574
575 if (target_pset != PROCESSOR_SET_NULL) {
576 assert3u(target_deadline, !=, RT_DEADLINE_NONE);
577
578 /* target_pset is a candidate for steal. Check again under its pset lock. */
579
580 pset = change_locked_pset(pset, target_pset);
581 if (os_atomic_load(&pset->stealable_rt_threads_earliest_deadline, relaxed) <= target_deadline) {
582 /* Steal the next thread from target_pset's runqueue. */
583 thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
584 pset_update_rt_stealable_state(pset);
585 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
586
587 pset = change_locked_pset(pset, stealing_pset);
588 return new_thread;
589 } else {
590 /* Failed to steal (another pset stole first). Try again. */
591 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(THREAD_NULL), pset->pset_id, pset->cpu_set_low, 1);
592 pset = change_locked_pset(pset, stealing_pset);
593 /* Update earliest_deadline in case it changed while the stealing_pset lock was not held. */
594 earliest_deadline = rt_runq_earliest_deadline(pset);
595 continue;
596 }
597 } else {
598 /* No steal candidates, stop searching. */
599 break;
600 }
601 }
602 /* No stealable threads, return with stealing_pset locked. */
603 pset = change_locked_pset(pset, stealing_pset);
604 return THREAD_NULL;
605 }
606 #endif /* CONFIG_SCHED_EDGE */
607
608 /*
609 * Called with the processor's pset locked; the lock may be dropped and retaken, but is held on return.
610 */
611 thread_t
612 sched_rt_choose_thread(processor_t processor)
613 {
614 processor_set_t pset = processor->processor_set;
615 pset_assert_locked(pset);
616
617 if (SCHED(rt_steal_thread) != NULL) {
618 do {
619 rt_clear_pending_spill(processor, 2);
620 thread_t new_thread = SCHED(rt_steal_thread)(pset);
621 /* pset lock may have been dropped and retaken, is currently locked */
622 pset_assert_locked(pset);
623 if (new_thread != THREAD_NULL) {
624 /* Spill might have been set if the pset lock was dropped in steal. */
625 rt_clear_pending_spill(processor, 3);
626 return new_thread;
627 }
628 } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
629 }
630 rt_clear_pending_spill(processor, 5);
631
632 if (rt_runq_count(pset) > 0) {
633 thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
634 assert(new_thread != THREAD_NULL);
635 pset_update_rt_stealable_state(pset);
636 return new_thread;
637 }
638
639 return THREAD_NULL;
640 }
641
642 void
643 sched_rt_init_pset(processor_set_t pset)
644 {
645 pset_rt_init(pset);
646 }
647
648 void
649 sched_rt_init_completed(void)
650 {
651 rt_init_completed();
652 }
653
654 void
655 sched_rt_queue_shutdown(processor_t processor)
656 {
657 processor_set_t pset = processor->processor_set;
658 thread_t thread;
659 queue_head_t tqueue;
660
661 pset_lock(pset);
662
663 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
664 if (bit_count(pset_available_cpumap(pset)) > 0) {
665 pset_unlock(pset);
666 return;
667 }
668
669 queue_init(&tqueue);
670
671 while (rt_runq_count(pset) > 0) {
672 thread = rt_runq_dequeue(&pset->rt_runq);
673 enqueue_tail(&tqueue, &thread->runq_links);
674 }
675 sched_update_pset_load_average(pset, 0);
676 pset_update_rt_stealable_state(pset);
677 pset_unlock(pset);
678
679 qe_foreach_element_safe(thread, &tqueue, runq_links) {
680 remqueue(&thread->runq_links);
681
682 thread_lock(thread);
683
684 thread_setrun(thread, SCHED_TAILQ);
685
686 thread_unlock(thread);
687 }
688 }
689
690 /*
691 * Assumes no pset lock is held; raises splsched and takes each pset lock itself while
692 * scanning. Also records tracepoints for pset bitmasks under the pset lock.
693 */
694 void
695 sched_rt_runq_scan(sched_update_scan_context_t scan_context)
696 {
697 thread_t thread;
698
699 pset_node_t node = &pset_node0;
700 processor_set_t pset = node->psets;
701
702 spl_t s = splsched();
703 do {
704 while (pset != NULL) {
705 pset_lock(pset);
706
707 bitmap_t *map = pset->rt_runq.bitmap;
708 for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
709 rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
710
711 qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
712 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
713 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
714 }
715 }
716 }
717
718 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PSET_BITMASKS),
719 pset->pset_id,
720 pset->recommended_bitmask,
721 pset->perfcontrol_cpu_migration_bitmask,
722 pset->perfcontrol_cpu_preferred_bitmask);
723
724 pset_unlock(pset);
725
726 pset = pset->pset_list;
727 }
728 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
729 splx(s);
730 }
731
732 int64_t
733 sched_rt_runq_count_sum(void)
734 {
735 pset_node_t node = &pset_node0;
736 processor_set_t pset = node->psets;
737 int64_t count = 0;
738
739 do {
740 while (pset != NULL) {
741 count += pset->rt_runq.runq_stats.count_sum;
742
743 pset = pset->pset_list;
744 }
745 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
746
747 return count;
748 }
749
750 #pragma mark - Utilities
751
752 uint64_t
753 rt_deadline_add(uint64_t d, uint64_t e)
754 {
755 uint64_t sum;
756 return os_add_overflow(d, e, &sum) ? RT_DEADLINE_NONE : sum;
757 }
758
759 cpumap_t
760 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
761 {
762 cpumap_t avail_map = pset_available_cpumap(pset);
763 #if CONFIG_SCHED_SMT
764 if (!sched_allow_rt_smt) {
765 /*
766 * Secondary CPUs are not allowed to run RT threads, so
767 * only primary CPUs should be included
768 */
769 avail_map &= pset->primary_map;
770 }
771 #endif /* CONFIG_SCHED_SMT */
772
773 return avail_map & ~pset->realtime_map;
774 }
775
776 /* pset is locked */
777 static processor_t
778 pset_choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
779 {
780 (void) consider_secondaries;
781 bool skip_spills = true;
782 bool include_ast_urgent_pending_cpus = false;
783
784 #if CONFIG_SCHED_SMT
785 processor_t next_processor = pset_choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills);
786 #else /* CONFIG_SCHED_SMT */
787 processor_t next_processor = pset_choose_processor_for_realtime_thread(pset, skip_processor, skip_spills);
788 #endif /* CONFIG_SCHED_SMT */
789 if (next_processor != PROCESSOR_NULL) {
790 return next_processor;
791 }
792
793 next_processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
794 return next_processor;
795 }
796
797 #if CONFIG_SCHED_EDGE
798 /*
799 * Realtime Steal Utilities
800 *
801 * Realtime steal is only supported on platforms with the edge scheduler.
802 */
803
804 /* Update realtime stealable state. */
805 void
806 pset_update_rt_stealable_state(processor_set_t pset)
807 {
808 pset_assert_locked(pset);
809 if (rt_pset_has_stealable_threads(pset)) {
810 os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, rt_runq_earliest_deadline(pset), relaxed);
811 } else {
812 os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE, relaxed);
813 }
814 }
815
816 bool
817 rt_pset_has_stealable_threads(processor_set_t pset)
818 {
819 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
820
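/*
 * Example: a pset with 6 queued RT threads but only 4 available CPUs that are
 * not already running RT threads reports stealable threads (6 > 4).
 */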
821 return rt_runq_count(pset) > bit_count(avail_map);
822 }
823
824 /*
825 * Returns the next processor to IPI for a migrating realtime thread. Realtime
826 * spill is only supported with the edge scheduler.
827 *
828 * Expects starting_pset to be locked. Returns false if starting_pset was never
829 * unlocked; otherwise, returns true with no lock held.
830 */
831 bool
832 rt_choose_next_processor_for_spill_IPI(
833 processor_set_t starting_pset,
834 processor_t chosen_processor,
835 processor_t *result_processor,
836 sched_ipi_type_t *result_ipi_type
837 )
838 {
839 assert3p(starting_pset, !=, PROCESSOR_SET_NULL);
840 assert3p(chosen_processor, !=, PROCESSOR_NULL);
841
842 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
843 int max_pri = rt_runq_priority(starting_pset);
844 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
845 processor_set_t pset = starting_pset; /* lock is held on this pset */
846 processor_t next_rt_processor = PROCESSOR_NULL;
847 /* Optimization so caller can avoid unnecessary lock-takes if there are no psets eligible for spill: */
848 bool starting_pset_was_unlocked = false;
849
850 cpumap_t candidate_map = ~BIT(starting_pset->pset_id); /* exclude stealing_pset */
851 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
852 while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, candidate_map, &istate)) {
853 assert3u(starting_pset->pset_id, !=, istate.spis_pset_id);
854 /* Check for null pset because we do not use acquire/release semantics for spill order. */
855 processor_set_t nset = pset_array[istate.spis_pset_id];
856 if (__improbable(nset == PROCESSOR_SET_NULL)) {
857 continue;
858 }
859
860 /* Make sure the pset is allowed to steal threads from stealing_pset's runqueue. */
861 sched_clutch_edge edge = sched_rt_config_get((pset_id_t) starting_pset->pset_id, (pset_id_t) istate.spis_pset_id);
862 if (istate.spis_pset_id != starting_pset->pset_id && edge.sce_steal_allowed == false) {
863 continue;
864 }
865 pset = change_locked_pset(pset, nset);
866 starting_pset_was_unlocked = true;
867
868 next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
869 if (next_rt_processor != PROCESSOR_NULL) {
870 break;
871 }
872 }
873
874 if (next_rt_processor != PROCESSOR_NULL) {
875 if (pset != starting_pset) {
876 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
877 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
878 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, spill_tid);
879 }
880 }
881 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
882 *result_processor = next_rt_processor;
883 }
884 if (starting_pset_was_unlocked) {
885 pset_unlock(pset);
886 return true;
887 } else {
888 return false;
889 }
890 }
891 #endif /* CONFIG_SCHED_EDGE */
892
893 bool
894 rt_pset_needs_a_followup_IPI(processor_set_t pset)
895 {
896 int nbackup_cpus = 0;
897
898 if (rt_runq_is_low_latency(pset)) {
899 nbackup_cpus = sched_rt_n_backup_processors;
900 }
901
902 int rt_rq_count = rt_runq_count(pset);
903
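/*
 * Example (illustrative numbers): with 2 queued RT threads under the low-latency
 * constraint threshold, an assumed sched_rt_n_backup_processors of 1, and 2 CPUs
 * already holding pending urgent ASTs, 2 + 1 - 2 > 0, so a follow-up IPI is
 * still warranted.
 */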
904 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
905 }
906
907 /*
908 * Returns the next processor to IPI as a followup for low-latency realtime
909 * threads on the runqueue.
910 *
911 * pset should be locked, and stays locked the whole time.
912 */
913 void
914 rt_choose_next_processor_for_followup_IPI(
915 processor_set_t pset,
916 processor_t chosen_processor,
917 processor_t *result_processor,
918 sched_ipi_type_t *result_ipi_type)
919 {
920 uint64_t earliest_deadline = rt_runq_earliest_deadline(pset);
921 int max_pri = rt_runq_priority(pset);
922 processor_t next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
923 if (next_rt_processor != PROCESSOR_NULL) {
924 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
925 *result_processor = next_rt_processor;
926 }
927 }
928
929 #if CONFIG_SCHED_SMT
930 extern int sched_avoid_cpu0;
931 extern int sched_allow_rt_smt;
932
933 /* pset is locked */
934 processor_t
935 pset_choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
936 {
937 #if defined(__x86_64__)
938 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
939 #else
940 const bool avoid_cpu0 = false;
941 #endif
942 cpumap_t cpu_map;
943
944 try_again:
945 cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
946 if (skip_processor) {
947 bit_clear(cpu_map, skip_processor->cpu_id);
948 }
949 if (skip_spills) {
950 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
951 }
952
953 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
954 bit_clear(cpu_map, 0);
955 }
956
957 cpumap_t primary_map = cpu_map & pset->primary_map;
958 if (avoid_cpu0) {
959 primary_map = bit_ror64(primary_map, 1);
960 }
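/*
 * When avoiding cpu0, rotate the map right by one so that cpu0 becomes the most
 * significant bit; lsb_first() then prefers every other primary CPU first, and
 * the (rotid + 1) & 63 mapping below undoes the rotation.
 */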
961
962 int rotid = lsb_first(primary_map);
963 if (rotid >= 0) {
964 int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
965
966 processor_t processor = processor_array[cpuid];
967
968 return processor;
969 }
970
971 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
972 goto out;
973 }
974
975 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
976 /* Also avoid cpu1 */
977 bit_clear(cpu_map, 1);
978 }
979
980 /* Consider secondary processors whose primary is actually running a realtime thread */
981 cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
982 if (avoid_cpu0) {
983 /* Also avoid cpu1 */
984 secondary_map = bit_ror64(secondary_map, 2);
985 }
986 rotid = lsb_first(secondary_map);
987 if (rotid >= 0) {
988 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
989
990 processor_t processor = processor_array[cpuid];
991
992 return processor;
993 }
994
995 /* Consider secondary processors */
996 secondary_map = cpu_map & ~pset->primary_map;
997 if (avoid_cpu0) {
998 /* Also avoid cpu1 */
999 secondary_map = bit_ror64(secondary_map, 2);
1000 }
1001 rotid = lsb_first(secondary_map);
1002 if (rotid >= 0) {
1003 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
1004
1005 processor_t processor = processor_array[cpuid];
1006
1007 return processor;
1008 }
1009
1010 /*
1011 * I was hoping the compiler would optimize
1012 * this away when avoid_cpu0 is const bool false,
1013 * but it still complains about the assignment
1014 * in that case.
1015 */
1016 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
1017 #if defined(__x86_64__)
1018 avoid_cpu0 = false;
1019 #else
1020 assert(0);
1021 #endif
1022 goto try_again;
1023 }
1024
1025 out:
1026 if (skip_processor) {
1027 return PROCESSOR_NULL;
1028 }
1029
1030 /*
1031 * If we didn't find an obvious processor to choose, but there are still more CPUs
1032 * not already running realtime threads than realtime threads in the realtime run queue,
1033 * this thread belongs in this pset, so choose some other processor in this pset
1034 * to ensure the thread is enqueued here.
1035 */
1036 cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
1037 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1038 cpu_map = non_realtime_map;
1039 assert(cpu_map != 0);
1040 int cpuid = bit_first(cpu_map);
1041 assert(cpuid >= 0);
1042 return processor_array[cpuid];
1043 }
1044
1045 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
1046 goto skip_secondaries;
1047 }
1048
1049 non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1050 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1051 cpu_map = non_realtime_map;
1052 assert(cpu_map != 0);
1053 int cpuid = bit_first(cpu_map);
1054 assert(cpuid >= 0);
1055 return processor_array[cpuid];
1056 }
1057
1058 skip_secondaries:
1059 return PROCESSOR_NULL;
1060 }
1061 #else /* !CONFIG_SCHED_SMT*/
1062 /* pset is locked */
1063 processor_t
1064 pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills)
1065 {
1066 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
1067 if (skip_processor) {
1068 bit_clear(cpu_map, skip_processor->cpu_id);
1069 }
1070 if (skip_spills) {
1071 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1072 }
1073
1074 int rotid = lsb_first(cpu_map);
1075 if (rotid >= 0) {
1076 return processor_array[rotid];
1077 }
1078
1079 /*
1080 * If we didn't find an obvious processor to choose, but there are still more CPUs
1081 * not already running realtime threads than realtime threads in the realtime run queue,
1082 * this thread belongs in this pset, so choose some other processor in this pset
1083 * to ensure the thread is enqueued here.
1084 */
1085 cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1086 if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1087 cpu_map = non_realtime_map;
1088 assert(cpu_map != 0);
1089 int cpuid = bit_first(cpu_map);
1090 assert(cpuid >= 0);
1091 return processor_array[cpuid];
1092 }
1093
1094 return PROCESSOR_NULL;
1095 }
1096 #endif /* !CONFIG_SCHED_SMT */
1097
1098 /*
1099 * Choose the processor with (1) the lowest priority at or below max_pri and (2) the furthest deadline at that priority.
1100 * If all available processors are at max_pri, choose the furthest deadline that exceeds minimum_deadline by more than rt_deadline_epsilon.
1101 *
1102 * pset is locked.
1103 */
1104 processor_t
1105 pset_choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
1106 {
1107 uint64_t furthest_deadline = rt_deadline_add(minimum_deadline, rt_deadline_epsilon);
1108 processor_t fd_processor = PROCESSOR_NULL;
1109 int lowest_priority = max_pri;
1110
1111 cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
1112 if (skip_processor) {
1113 bit_clear(cpu_map, skip_processor->cpu_id);
1114 }
1115 if (skip_spills) {
1116 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1117 }
1118
1119 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1120 processor_t processor = processor_array[cpuid];
1121
1122 if (processor->current_pri > lowest_priority) {
1123 continue;
1124 }
1125
1126 if (processor->current_pri < lowest_priority) {
1127 lowest_priority = processor->current_pri;
1128 furthest_deadline = processor->deadline;
1129 fd_processor = processor;
1130 continue;
1131 }
1132
1133 if (processor->deadline > furthest_deadline) {
1134 furthest_deadline = processor->deadline;
1135 fd_processor = processor;
1136 }
1137 }
1138
1139 if (fd_processor) {
1140 return fd_processor;
1141 }
1142
1143 /*
1144 * There is a race condition possible when there are multiple processor sets.
1145 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
1146 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
1147 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
1148 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
1149 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
1150 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
1151 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
1152 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
1153 *
1154 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
1155 * on the run queue of that pset.
1156 */
1157 if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
1158 cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
1159 assert(skip_processor == PROCESSOR_NULL);
1160 assert(skip_spills == false);
1161
1162 for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1163 processor_t processor = processor_array[cpuid];
1164
1165 if (processor->current_pri > lowest_priority) {
1166 continue;
1167 }
1168
1169 if (processor->current_pri < lowest_priority) {
1170 lowest_priority = processor->current_pri;
1171 furthest_deadline = processor->deadline;
1172 fd_processor = processor;
1173 continue;
1174 }
1175
1176 if (processor->deadline > furthest_deadline) {
1177 furthest_deadline = processor->deadline;
1178 fd_processor = processor;
1179 }
1180 }
1181 }
1182
1183 return fd_processor;
1184 }
1185
1186 bool
1187 rt_clear_pending_spill(processor_t processor, int reason)
1188 {
1189 processor_set_t pset = processor->processor_set;
1190 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
1191 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, reason);
1192 return true;
1193 } else {
1194 return false;
1195 }
1196 }
1197
1198 #pragma mark - Realtime Runqueues
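
/*
 * Each pset's rt_runq keeps one deadline-ordered queue per realtime priority
 * level (BASEPRI_RTQUEUES..MAXPRI, NRTQS levels in total), a bitmap of the
 * non-empty levels, and cached aggregates (count, earliest deadline, the
 * earliest-deadline thread's constraint, and its priority index) maintained
 * with relaxed atomics.
 */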
1199
1200 #if DEBUG || SCHED_TEST_HARNESS
1201 void
1202 check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
1203 {
1204 bitmap_t *map = rt_run_queue->bitmap;
1205
1206 uint64_t earliest_deadline = RT_DEADLINE_NONE;
1207 uint32_t constraint = RT_CONSTRAINT_NONE;
1208 int ed_index = NOPRI;
1209 int count = 0;
1210 bool found_thread = false;
1211
1212 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
1213 int i = pri - BASEPRI_RTQUEUES;
1214 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1215 queue_t queue = &rt_runq->pri_queue;
1216 queue_entry_t iter;
1217 int n = 0;
1218 uint64_t previous_deadline = 0;
1219 qe_foreach(iter, queue) {
1220 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1221 assert_thread_magic(iter_thread);
1222 if (iter_thread == thread) {
1223 found_thread = true;
1224 }
1225 assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
1226 assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
1227 assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
1228 assert(previous_deadline <= iter_thread->realtime.deadline);
1229 n++;
1230 if (iter == queue_first(queue)) {
1231 assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
1232 assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
1233 }
1234 previous_deadline = iter_thread->realtime.deadline;
1235 }
1236 assert(n == rt_runq->pri_count);
1237 if (n == 0) {
1238 assert(bitmap_test(map, i) == false);
1239 assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
1240 assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
1241 } else {
1242 assert(bitmap_test(map, i) == true);
1243 }
1244 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1245 earliest_deadline = rt_runq->pri_earliest_deadline;
1246 constraint = rt_runq->pri_constraint;
1247 ed_index = i;
1248 }
1249 count += n;
1250 }
1251 assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
1252 assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
1253 assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
1254 assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
1255 if (thread) {
1256 assert(found_thread);
1257 }
1258 }
1259 #endif /* DEBUG || SCHED_TEST_HARNESS */
1260
1261 static bool
1262 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
1263 {
1264 int pri = thread->sched_pri;
1265 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1266 int i = pri - BASEPRI_RTQUEUES;
1267 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1268 bitmap_t *map = rt_run_queue->bitmap;
1269
1270 bitmap_set(map, i);
1271
1272 queue_t queue = &rt_runq->pri_queue;
1273 uint64_t deadline = thread->realtime.deadline;
1274 bool preempt = false;
1275 bool earliest = false;
1276
1277 if (queue_empty(queue)) {
1278 enqueue_tail(queue, &thread->runq_links);
1279 preempt = true;
1280 earliest = true;
1281 rt_runq->pri_earliest_deadline = deadline;
1282 rt_runq->pri_constraint = thread->realtime.constraint;
1283 } else {
1284 /* Insert into rt_runq in thread deadline order */
1285 queue_entry_t iter;
1286 qe_foreach(iter, queue) {
1287 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1288 assert_thread_magic(iter_thread);
1289
1290 if (deadline < iter_thread->realtime.deadline) {
1291 if (iter == queue_first(queue)) {
1292 preempt = true;
1293 earliest = true;
1294 rt_runq->pri_earliest_deadline = deadline;
1295 rt_runq->pri_constraint = thread->realtime.constraint;
1296 }
1297 insque(&thread->runq_links, queue_prev(iter));
1298 break;
1299 } else if (iter == queue_last(queue)) {
1300 enqueue_tail(queue, &thread->runq_links);
1301 break;
1302 }
1303 }
1304 }
1305 if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
1306 os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
1307 os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
1308 os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
1309 }
1310
1311 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1312 rt_runq->pri_count++;
1313 os_atomic_inc(&rt_run_queue->count, relaxed);
1314
1315 thread_set_runq_locked(thread, processor);
1316
1317 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1318
1319 return preempt;
1320 }
1321
1322 uint64_t
1323 rt_runq_earliest_deadline(processor_set_t pset)
1324 {
1325 return os_atomic_load_wide(&pset->rt_runq.earliest_deadline, relaxed);
1326 }
1327
1328 /*
1329 * rt_runq_insert:
1330 *
1331 * Enqueue a thread for realtime execution.
1332 */
1333 bool
1334 rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread)
1335 {
1336 pset_assert_locked(pset);
1337
1338 bool preempt = rt_runq_enqueue(&pset->rt_runq, thread, processor);
1339 pset_update_rt_stealable_state(pset);
1340
1341 return preempt;
1342 }
1343
1344 int
1345 rt_runq_count(processor_set_t pset)
1346 {
1347 return os_atomic_load(&pset->rt_runq.count, relaxed);
1348 }
1349
1350 int
1351 rt_runq_priority(processor_set_t pset)
1352 {
1353 pset_assert_locked(pset);
1354 rt_queue_t rt_run_queue = &pset->rt_runq;
1355
1356 bitmap_t *map = rt_run_queue->bitmap;
1357 int i = bitmap_first(map, NRTQS);
1358 assert(i < NRTQS);
1359
1360 if (i >= 0) {
1361 return i + BASEPRI_RTQUEUES;
1362 }
1363
1364 return i;
1365 }
1366
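/*
 * A runqueue counts as low latency when the cached constraint of its
 * earliest-deadline thread is at or below rt_constraint_threshold (4ms by
 * default), which is what gates sending backup IPIs for queued RT work.
 */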
1367 bool
1368 rt_runq_is_low_latency(processor_set_t pset)
1369 {
1370 return os_atomic_load(&pset->rt_runq.constraint, relaxed) <= rt_constraint_threshold;
1371 }
1372
1373 thread_t
1374 rt_runq_dequeue(rt_queue_t rt_run_queue)
1375 {
1376 bitmap_t *map = rt_run_queue->bitmap;
1377 int i = bitmap_first(map, NRTQS);
1378 assert((i >= 0) && (i < NRTQS));
1379
1380 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1381
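/*
 * By default (non-strict priority), prefer the earliest-deadline thread even if
 * it sits at a lower RT priority, as long as running it first still leaves the
 * higher-priority thread enough time to complete its computation within its
 * constraint (with rt_deadline_epsilon of slack).
 */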
1382 if (!sched_rt_runq_strict_priority) {
1383 int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
1384 if (ed_index != i) {
1385 assert((ed_index >= 0) && (ed_index < NRTQS));
1386 rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
1387
1388 thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
1389 thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1390
1391 if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
1392 /* choose the earliest deadline thread */
1393 rt_runq = ed_runq;
1394 i = ed_index;
1395 }
1396 }
1397 }
1398
1399 assert(rt_runq->pri_count > 0);
1400 uint64_t earliest_deadline = RT_DEADLINE_NONE;
1401 uint32_t constraint = RT_CONSTRAINT_NONE;
1402 int ed_index = NOPRI;
1403 thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
1404 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1405 if (--rt_runq->pri_count > 0) {
1406 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1407 assert(next_rt != THREAD_NULL);
1408 earliest_deadline = next_rt->realtime.deadline;
1409 constraint = next_rt->realtime.constraint;
1410 ed_index = i;
1411 } else {
1412 bitmap_clear(map, i);
1413 }
1414 rt_runq->pri_earliest_deadline = earliest_deadline;
1415 rt_runq->pri_constraint = constraint;
1416
1417 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1418 rt_runq = &rt_run_queue->rt_queue_pri[i];
1419 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1420 earliest_deadline = rt_runq->pri_earliest_deadline;
1421 constraint = rt_runq->pri_constraint;
1422 ed_index = i;
1423 }
1424 }
1425 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1426 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1427 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1428 os_atomic_dec(&rt_run_queue->count, relaxed);
1429
1430 thread_clear_runq(new_thread);
1431
1432 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1433
1434 return new_thread;
1435 }
1436
1437 thread_t
1438 rt_runq_first(rt_queue_t rt_run_queue)
1439 {
1440 bitmap_t *map = rt_run_queue->bitmap;
1441 int i = bitmap_first(map, NRTQS);
1442 if (i < 0) {
1443 return THREAD_NULL;
1444 }
1445 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1446 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1447
1448 return next_rt;
1449 }
1450
1451 void
1452 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
1453 {
1454 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1455
1456 int pri = thread->sched_pri;
1457 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1458 int i = pri - BASEPRI_RTQUEUES;
1459 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1460 bitmap_t *map = rt_run_queue->bitmap;
1461
1462 assert(rt_runq->pri_count > 0);
1463 uint64_t earliest_deadline = RT_DEADLINE_NONE;
1464 uint32_t constraint = RT_CONSTRAINT_NONE;
1465 int ed_index = NOPRI;
1466 remqueue(&thread->runq_links);
1467 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1468 if (--rt_runq->pri_count > 0) {
1469 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1470 earliest_deadline = next_rt->realtime.deadline;
1471 constraint = next_rt->realtime.constraint;
1472 ed_index = i;
1473 } else {
1474 bitmap_clear(map, i);
1475 }
1476 rt_runq->pri_earliest_deadline = earliest_deadline;
1477 rt_runq->pri_constraint = constraint;
1478
1479 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1480 rt_runq = &rt_run_queue->rt_queue_pri[i];
1481 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1482 earliest_deadline = rt_runq->pri_earliest_deadline;
1483 constraint = rt_runq->pri_constraint;
1484 ed_index = i;
1485 }
1486 }
1487 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1488 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1489 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1490 os_atomic_dec(&rt_run_queue->count, relaxed);
1491
1492 thread_clear_runq_locked(thread);
1493
1494 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1495 }
1496