/* xref: /xnu-12377.81.4/osfmk/kern/sched_rt.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796) */

/*
 * Copyright (c) 2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <stdint.h>

#include <kern/bits.h>
#include <kern/kern_types.h>
#include <kern/processor.h>
#include <kern/sched_clutch.h>
#include <kern/sched_common.h>
#include <kern/sched_prim.h>
#include <kern/sched_rt.h>
#include <kern/thread.h>
#include <kern/queue.h>

#include <sys/kdebug_kernel.h>

#include <os/atomic_private.h>

#include <machine/machine_routines.h>

#define KTRC KDBG_RELEASE

#pragma mark - Constants and Tunables

#include <kern/startup.h>

/*
 * Tunables controlling how xnu initializes the realtime matrix. CLPC can
 * override their effects with sched_perfcontrol interfaces.
 */

TUNABLE(unsigned int, sched_rt_spill_policy, "sched_rt_spill_policy", 1);

TUNABLE(unsigned, sched_rt_steal_policy, "sched_rt_steal_policy", 2);

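/*
 * Illustrative only: TUNABLE() registers the quoted names as boot-args, so
 * the default policies could be overridden at boot with something like
 * "sched_rt_spill_policy=0 sched_rt_steal_policy=1".
 */
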
uint32_t rt_deadline_epsilon;
uint32_t rt_constraint_threshold;
/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;
uint32_t max_rt_quantum;
uint32_t min_rt_quantum;
int sched_allow_rt_smt = 1;
int sched_rt_runq_strict_priority = false;

int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}

void
sched_set_rt_deadline_epsilon(int new_epsilon_us)
{
	rt_deadline_epsilon_us = new_epsilon_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
	rt_deadline_epsilon = (uint32_t)abstime;
}

#pragma mark - Initialization

void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}

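/*
 * Worked example (illustrative, assuming a 24 MHz timebase, i.e. 24 ticks
 * per microsecond):
 *   min_rt_quantum:            50 us -> 1,200 ticks
 *   max_rt_quantum:            50 ms -> 1,200,000 ticks
 *   rt_constraint_threshold:    4 ms -> 96,000 ticks
 *   rt_deadline_epsilon:      100 us -> 2,400 ticks
 * All of these fit comfortably in 32 bits, which the asserts above rely on.
 */
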
#if CONFIG_SCHED_EDGE
/* forward-declare config utility */
static void
sched_rt_config_pset_push(processor_set_t pset);
#endif /* CONFIG_SCHED_EDGE */

static void
rt_init_completed(void)
{
	/* Realtime spill/steal are only supported on platforms with the edge scheduler. */
#if CONFIG_SCHED_EDGE
	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	for (pset_id_t src_pset_id = 0; src_pset_id < sched_num_psets; src_pset_id++) {
		processor_set_t src_pset = pset_array[src_pset_id];
		assert3p(src_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */

		/* For each cluster, set all its outgoing edge parameters */
		for (pset_id_t dst_pset_id = 0; dst_pset_id < sched_num_psets; dst_pset_id++) {
			if (dst_pset_id == src_pset_id) {
				continue;
			}
			processor_set_t dst_pset = pset_array[dst_pset_id];
			assert3p(dst_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */

			bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
			if (clusters_homogenous) {
				/* Default realtime policy: spill allowed among homogeneous psets. */
				sched_rt_config_set((pset_id_t) src_pset_id, (pset_id_t) dst_pset_id, (sched_clutch_edge) {
					.sce_migration_allowed = true,
					.sce_steal_allowed = true,
					.sce_migration_weight = 0,
				});
			} else {
				/* Default realtime policy: disallow spill among heterogeneous psets. */
				sched_rt_config_set((pset_id_t) src_pset_id, (pset_id_t) dst_pset_id, (sched_clutch_edge) {
					.sce_migration_allowed = false,
					.sce_steal_allowed = false,
					.sce_migration_weight = 0,
				});
			}
		}
	}

	for (pset_id_t pset_id = 0; pset_id < sched_num_psets; pset_id++) {
		sched_rt_config_pset_push(pset_array[pset_id]);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
#endif /* CONFIG_SCHED_EDGE */
}

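/*
 * Illustrative sketch of the resulting default matrix on a hypothetical
 * system with two P-clusters (P0, P1) and one E-cluster (E0):
 *
 *   src \ dst    P0           P1           E0
 *   P0           -            spill/steal  disabled
 *   P1           spill/steal  -            disabled
 *   E0           disabled     disabled     -
 *
 * Only the homogeneous P0<->P1 edges allow migration and steal; edges
 * between heterogeneous clusters stay disabled until CLPC overrides them.
 */
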
static void
pset_rt_init(processor_set_t pset)
{
	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
		queue_init(&rqi->pri_queue);
		rqi->pri_count = 0;
		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
		rqi->pri_constraint = RT_CONSTRAINT_NONE;
	}
	os_atomic_init(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE);

	rt_queue_t rt_runq = &pset->rt_runq;
	os_atomic_init(&rt_runq->count, 0);
	os_atomic_init(&rt_runq->earliest_deadline, RT_DEADLINE_NONE);
	os_atomic_init(&rt_runq->constraint, RT_CONSTRAINT_NONE);
	os_atomic_init(&rt_runq->ed_index, NOPRI);
	bzero(&rt_runq->bitmap, sizeof(rt_runq->bitmap));
	bzero(&rt_runq->runq_stats, sizeof(rt_runq->runq_stats));

#if __AMP__
	/*
	 * Initialize spill/steal search orders as invalid to prevent spill/steal
	 * before the matrix is configured.
	 */
	bzero(pset->sched_rt_edges, sizeof(pset->sched_rt_edges));
	for (pset_id_t i = 0; i < MAX_PSETS - 1; i++) {
		pset->sched_rt_spill_search_order.spso_search_order[i] = PSET_ID_INVALID;
#if CONFIG_SCHED_EDGE
		pset->sched_rt_steal_search_order.spso_search_order[i] = PSET_ID_INVALID;
#endif /* CONFIG_SCHED_EDGE */
	}
#endif /* __AMP__ */
}

#pragma mark - Realtime Scheduler/CLPC interface

#if CONFIG_SCHED_EDGE
void
sched_rt_config_set(
	uint8_t src_pset,
	uint8_t dst_pset,
	sched_clutch_edge edge_config)
{
	assert(src_pset != dst_pset || !edge_config.sce_migration_allowed); /* No self-edges. */
	os_atomic_store(&pset_array[src_pset]->sched_rt_edges[dst_pset], edge_config, relaxed);
}

sched_clutch_edge
sched_rt_config_get(
	uint8_t src_pset,
	uint8_t dst_pset)
{
	return os_atomic_load(&pset_array[src_pset]->sched_rt_edges[dst_pset], relaxed);
}

void
sched_rt_matrix_get(
	sched_clutch_edge *edge_matrix,
	bool *edge_requests,
	uint64_t num_psets)
{
	uint64_t edge_index = 0;
	for (uint8_t src_pset = 0; src_pset < num_psets; src_pset++) {
		for (uint8_t dst_pset = 0; dst_pset < num_psets; dst_pset++) {
			if (edge_requests[edge_index]) {
				edge_matrix[edge_index] = sched_rt_config_get(src_pset, dst_pset);
			}
			edge_index++;
		}
	}
}

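/*
 * The matrix is flattened row-major, so with num_psets == 3 the edge from
 * src_pset 1 to dst_pset 2 lives at index 1 * 3 + 2 == 5. A hypothetical
 * caller fetching just that edge:
 *
 *   sched_clutch_edge matrix[3 * 3];
 *   bool requests[3 * 3] = { [5] = true };
 *   sched_rt_matrix_get(matrix, requests, 3);
 *   // matrix[5] now holds the pset 1 -> pset 2 edge config
 */
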
/*
 * sched_rt_config_pset_push()
 *
 * After using sched_rt_config_set() to update edge tunables outgoing from a particular source
 * pset, this function should be called in order to propagate the updates to derived metadata for
 * the pset, such as search orders for outgoing spill and steal.
 */
static void
sched_rt_config_pset_push(processor_set_t pset)
{
	assert3u(pset->pset_id, <, UINT8_MAX);

	sched_pset_search_order_sort_data_t spill_datas[MAX_PSETS - 1], steal_datas[MAX_PSETS - 1];
	uint num_spill_datas = 0, num_steal_datas = 0;
	for (pset_id_t other_pset_id = 0; other_pset_id < sched_num_psets; other_pset_id++) {
		if (pset->pset_id == other_pset_id) {
			continue; /* No self-edges. */
		}
		/* Spill */
		sched_clutch_edge out_edge = sched_rt_config_get((pset_id_t)pset->pset_cluster_id, other_pset_id);
		if (out_edge.sce_migration_allowed) {
			spill_datas[num_spill_datas++] = (sched_pset_search_order_sort_data_t) {
				.spsosd_src_pset = pset,
				.spsosd_migration_weight = out_edge.sce_migration_weight,
				.spsosd_dst_pset_id = other_pset_id
			};
		}
		/* Steal */
		sched_clutch_edge in_edge = sched_rt_config_get(other_pset_id, (pset_id_t)pset->pset_cluster_id);
		if (in_edge.sce_steal_allowed) {
			steal_datas[num_steal_datas++] = (sched_pset_search_order_sort_data_t) {
				.spsosd_src_pset = pset,
				.spsosd_migration_weight = in_edge.sce_migration_weight,
				.spsosd_dst_pset_id = other_pset_id,
			};
		}
	}
	sched_pset_search_order_compute(&pset->sched_rt_spill_search_order, spill_datas, num_spill_datas, sched_edge_search_order_weight_then_locality_cmp);
	sched_pset_search_order_compute(&pset->sched_rt_steal_search_order, steal_datas, num_steal_datas, sched_edge_search_order_weight_then_locality_cmp);
}

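/*
 * Illustrative example: with psets {0, 1, 2}, if the outgoing edges 0->1 and
 * 0->2 both allow migration, pset 0's spill search order is a permutation of
 * {1, 2} chosen by sched_edge_search_order_weight_then_locality_cmp (ordered
 * by migration weight, with locality as the tie-break); if only the incoming
 * edge 2->0 has sce_steal_allowed set, pset 0 may steal from pset 2 alone,
 * so its steal search order is just {2}.
 */
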
void
sched_rt_matrix_set(
	sched_clutch_edge *rt_matrix,
	bool *edge_changes,
	uint64_t num_psets)
{
	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	for (uint8_t src_pset_id = 0; src_pset_id < num_psets; src_pset_id++) {
		for (uint8_t dst_pset_id = 0; dst_pset_id < num_psets; dst_pset_id++) {
			const uint64_t rt_matrix_index = src_pset_id * num_psets + dst_pset_id;
			if (edge_changes[rt_matrix_index]) {
				sched_rt_config_set(src_pset_id, dst_pset_id, rt_matrix[rt_matrix_index]);
			}
		}
	}

	for (pset_id_t pset_id = 0; pset_id < num_psets; pset_id++) {
		sched_rt_config_pset_push(pset_array[pset_id]);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
#endif /* CONFIG_SCHED_EDGE */

#pragma mark - Scheduler Callouts

#if CONFIG_SCHED_SMT
/*
 * SMT-aware callout for rt_choose_processor.
 */
processor_t
sched_rtlocal_choose_processor_smt(
	processor_set_t         starting_pset,
	processor_t             processor,
	thread_t                thread)
{
	processor_set_t nset = PROCESSOR_SET_NULL;
	processor_set_t pset = starting_pset;
	pset_node_t node = pset->node;

	processor_t lc_processor = processor;
	integer_t lowest_count = INT_MAX;
	if (lc_processor != PROCESSOR_NULL) {
		lowest_count = SCHED(processor_runq_count)(processor);
	}

	bool include_ast_urgent_pending_cpus = false;
	cpumap_t ast_urgent_pending;
try_again:
	ast_urgent_pending = 0;
	int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
	for (; consider_secondaries < 2; consider_secondaries++) {
		pset = change_locked_pset(pset, starting_pset);
		do {
			cpumap_t available_map = pset_available_cpumap(pset);
			if (available_map == 0) {
				goto no_available_cpus;
			}

			processor = pset_choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false);
			if (processor) {
				return processor;
			}

			if (consider_secondaries) {
				processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
				if (processor) {
					/*
					 * Instead of looping through all the psets to find the global
					 * furthest deadline processor, preempt the first candidate found.
					 * The preempted thread will then find any other available far deadline
					 * processors to preempt.
					 */
					return processor;
				}

				ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;

				if (rt_runq_count(pset) < lowest_count) {
					int cpuid = bit_first(available_map);
					assert(cpuid >= 0);
					lc_processor = processor_array[cpuid];
					lowest_count = rt_runq_count(pset);
				}
			}

no_available_cpus:
			nset = next_pset(pset);

			if (nset != starting_pset) {
				pset = change_locked_pset(pset, nset);
			}
		} while (nset != starting_pset);
	}

	/* Shortcut for single pset nodes */
	if (bit_count(node->pset_map) == 1) {
		if (lc_processor) {
			pset_assert_locked(lc_processor->processor_set);
			return lc_processor;
		}
	} else {
		if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
			/* See the comment in pset_choose_furthest_deadline_processor_for_realtime_thread() */
			include_ast_urgent_pending_cpus = true;
			goto try_again;
		}
	}

	processor = lc_processor;

	if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
		/* Check that chosen processor is still usable */
		cpumap_t available_map = pset_available_cpumap(pset);
		if (bit_test(available_map, processor->cpu_id)) {
			return processor;
		}

		/* processor is no longer usable */
		processor = PROCESSOR_NULL;
	}

	pset_assert_locked(pset);
	pset_unlock(pset);
	return PROCESSOR_NULL;
}
#else /* !CONFIG_SCHED_SMT */
/*
 * Called with thread and starting_pset locked. The returned processor's pset is
 * locked on return.
 */
processor_t
sched_rt_choose_processor(
	const processor_set_t starting_pset,
	processor_t processor,
	thread_t thread)
{
	assert3u(thread->sched_pri, >=, BASEPRI_RTQUEUES);
	assert3u(thread->sched_pri, <=, MAXPRI);

	/*
	 * In choose_starting_pset, we found a good candidate pset for this thread.
	 * Now, we pick the best processor to preempt, if there is one.  It is also
	 * possible that conditions have changed and the thread should spill to
	 * another pset.
	 */

	processor_set_t pset = starting_pset; /* Lock is held on this pset. */
	pset_assert_locked(pset);

#if __AMP__
	/*
	 * If there are processors with outstanding urgent preemptions, we consider
	 * them in a second pass. While we are changing pset locks here, it is
	 * possible a processor may resolve its outstanding urgent preemption and
	 * become eligible to run this thread. See comment in
	 * pset_choose_furthest_deadline_processor_for_realtime_thread().
	 */
	bool found_ast_urgent_pending = false; /* Tracks whether any (eligible) processors have pending urgent ASTs. */
	for (int include_ast_urgent_pending_cpus = 0; include_ast_urgent_pending_cpus < 2; include_ast_urgent_pending_cpus++) {
		if (include_ast_urgent_pending_cpus && !found_ast_urgent_pending) {
			break; /* Skip the second pass. */
		}

		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
		while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, ~0, &istate)) {
			/*
			 * Switch to the next pset. We need to check for null psets because
			 * we do not use acquire/release semantics for the spill order.
			 */
			processor_set_t nset = pset_array[istate.spis_pset_id];
			if (__improbable(nset == PROCESSOR_SET_NULL)) {
				continue;
			}
			pset = change_locked_pset(pset, nset);

			processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
			if (processor != PROCESSOR_NULL) {
				/* We found a candidate processor on this pset to wake or preempt. */
				pset_assert_locked(processor->processor_set);
				return processor;
			}

			/* TODO <rdar://140219824>: Policy question of EDF vs targeting idle cores on another pset. */
			processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
			if (processor) {
				/*
				 * Instead of looping through all the psets to find the global
				 * furthest deadline processor, preempt the first candidate found.
				 * The preempted thread will then find any other available far deadline
				 * processors to preempt.
				 */
				pset_assert_locked(processor->processor_set);
				return processor;
			}

			found_ast_urgent_pending = found_ast_urgent_pending || (pset->pending_AST_URGENT_cpu_mask != 0);
		}
	}

	/*
	 * There was no obvious (idle or non-realtime) processor to run the thread.
	 * Instead, do EDF scheduling again on starting_pset, putting the thread on
	 * the run queue if there is no processor to preempt.
	 */

	pset = change_locked_pset(pset, starting_pset);
#endif /* __AMP__ */

	/* Check (again, for AMP systems) that there is no lower-priority or idle processor. */
	processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
	if (processor != PROCESSOR_NULL) {
		/* We found a candidate processor on this pset to wake or preempt. */
		pset_assert_locked(processor->processor_set);
		return processor;
	}

	processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, true);
	if (processor == PROCESSOR_NULL) {
		/*
		 * Choose an arbitrary available and recommended processor from the pset.
		 * It won't get preempted anyway, since this thread has a later
		 * deadline.
		 */
		int processor_id = lsb_first(pset_available_cpumap(pset));

		/*
		 * starting_pset had available, recommended processors coming into
		 * rt_choose_processor(), but that might have changed after dropping the
		 * pset lock. If there are no such processors, bail out here and let
		 * sched_edge_migrate_candidate() find a better starting pset.
		 */
		if (processor_id < 0) {
			pset_unlock(pset);
			return PROCESSOR_NULL;
		}

		processor = processor_array[processor_id];
	}

	pset_assert_locked(processor->processor_set);
	return processor;
}
#endif /* !CONFIG_SCHED_SMT */

#if CONFIG_SCHED_EDGE
/*
 * Called with stealing_pset locked and returns with stealing_pset locked but
 * the lock will have been dropped if a thread is returned. The lock may have
 * been temporarily dropped, even if no thread is returned.
 */
thread_t
sched_rt_steal_thread(processor_set_t stealing_pset)
{
	uint64_t earliest_deadline = rt_runq_earliest_deadline(stealing_pset);
	processor_set_t pset = stealing_pset;

	/* Continue searching until there are no steal candidates found in a single iteration. */
	while (true) {
		processor_set_t target_pset = NULL;
		uint64_t target_deadline;
		if (__improbable(os_sub_overflow(earliest_deadline, rt_deadline_epsilon, &target_deadline))) {
			target_deadline = 0;
		}

		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
		while (sched_iterate_psets_ordered(stealing_pset, &stealing_pset->sched_rt_steal_search_order, ~BIT(stealing_pset->pset_id), &istate)) {
			assert3s(istate.spis_pset_id, !=, stealing_pset->pset_id); /* stealing_pset's runqueue is drained by sched_rt_choose_processor */
			const processor_set_t nset = pset_array[istate.spis_pset_id];
			/* Check for null because we do not use acquire/release semantics for steal order. */
			if (__improbable(nset == PROCESSOR_SET_NULL)) {
				continue;
			}
			uint64_t nset_deadline = os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed);
			if (nset_deadline < target_deadline) {
				target_pset = nset;
				target_deadline = nset_deadline;
			}
		}

		if (target_pset != PROCESSOR_SET_NULL) {
			assert3u(target_deadline, !=, RT_DEADLINE_NONE);

			/* target_pset is a candidate for steal. Check again under its pset lock. */

			pset = change_locked_pset(pset, target_pset);
			if (os_atomic_load(&pset->stealable_rt_threads_earliest_deadline, relaxed) <= target_deadline) {
				/* Steal the next thread from target_pset's runqueue. */
				thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
				pset_update_rt_stealable_state(pset);
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);

				pset = change_locked_pset(pset, stealing_pset);
				return new_thread;
			} else {
				/* Failed to steal (another pset stole first). Try again. */
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(THREAD_NULL), pset->pset_id, pset->cpu_set_low, 1);
				pset = change_locked_pset(pset, stealing_pset);
				/* Update earliest_deadline in case it changed while the stealing_pset lock was not held. */
				earliest_deadline = rt_runq_earliest_deadline(pset);
				continue;
			}
		} else {
			/* No steal candidates, stop searching. */
			break;
		}
	}
	/* No stealable threads, return with stealing_pset locked. */
	pset = change_locked_pset(pset, stealing_pset);
	return THREAD_NULL;
}
#endif /* CONFIG_SCHED_EDGE */

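/*
 * Numeric sketch of the steal hysteresis above (illustrative values): with
 * rt_deadline_epsilon equivalent to 100 us and a local earliest deadline of
 * t + 1000 us, a remote pset only qualifies as a steal target if its
 * stealable earliest deadline is strictly before t + 900 us, so marginal
 * deadline differences between psets do not trigger a steal.
 */
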
/*
 * processor's pset is locked, may drop and retake the lock
 */
thread_t
sched_rt_choose_thread(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	pset_assert_locked(pset);

	if (SCHED(rt_steal_thread) != NULL) {
		do {
			rt_clear_pending_spill(processor, 2);
			thread_t new_thread = SCHED(rt_steal_thread)(pset);
			/* pset lock may have been dropped and retaken, is currently locked */
			pset_assert_locked(pset);
			if (new_thread != THREAD_NULL) {
				/* Spill might have been set if the pset lock was dropped in steal. */
				rt_clear_pending_spill(processor, 3);
				return new_thread;
			}
		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
	}
	rt_clear_pending_spill(processor, 5);

	if (rt_runq_count(pset) > 0) {
		thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
		assert(new_thread != THREAD_NULL);
		pset_update_rt_stealable_state(pset);
		return new_thread;
	}

	return THREAD_NULL;
}

void
sched_rt_init_pset(processor_set_t pset)
{
	pset_rt_init(pset);
}

void
sched_rt_init_completed(void)
{
	rt_init_completed();
}

void
sched_rt_queue_shutdown(processor_t processor, struct pulled_thread_queue *threadq)
{
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	while (rt_runq_count(pset) > 0) {
		thread_t thread = rt_runq_dequeue(&pset->rt_runq);
		pulled_thread_queue_enqueue(threadq, thread);
	}
	SCHED(update_pset_load_average)(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);
}

/*
 * Assumes RT lock is not held, and acquires splsched/rt_lock itself.
 * Also records tracepoints for pset bitmasks under the pset lock.
 */
void
sched_rt_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t        thread;

	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PSET_BITMASKS),
			    pset->pset_id,
			    pset->recommended_bitmask,
			    pset->perfcontrol_cpu_migration_bitmask,
			    pset->perfcontrol_cpu_preferred_bitmask);

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}

int64_t
sched_rt_runq_count_sum(void)
{
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;
	int64_t count = 0;

	do {
		while (pset != NULL) {
			count += pset->rt_runq.runq_stats.count_sum;

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

	return count;
}

#pragma mark - Utilities

uint64_t
rt_deadline_add(uint64_t d, uint64_t e)
{
	uint64_t sum;
	return os_add_overflow(d, e, &sum) ? RT_DEADLINE_NONE : sum;
}

cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
{
	cpumap_t avail_map = pset_available_cpumap(pset);
#if CONFIG_SCHED_SMT
	if (!sched_allow_rt_smt) {
		/*
		 * Secondary CPUs are not allowed to run RT threads, so
		 * only primary CPUs should be included
		 */
		avail_map &= pset->primary_map;
	}
#endif /* CONFIG_SCHED_SMT */

	return avail_map & ~pset->realtime_map;
}

/* pset is locked */
static processor_t
pset_choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
{
	(void) consider_secondaries;
	bool skip_spills = true;
	bool include_ast_urgent_pending_cpus = false;

#if CONFIG_SCHED_SMT
	processor_t next_processor = pset_choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills);
#else /* CONFIG_SCHED_SMT */
	processor_t next_processor = pset_choose_processor_for_realtime_thread(pset, skip_processor, skip_spills);
#endif /* CONFIG_SCHED_SMT */
	if (next_processor != PROCESSOR_NULL) {
		return next_processor;
	}

	next_processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
	return next_processor;
}

#if CONFIG_SCHED_EDGE
/*
 * Realtime Steal Utilities
 *
 * Realtime steal is only supported on platforms with the edge scheduler.
 */

/* Update realtime stealable state. */
void
pset_update_rt_stealable_state(processor_set_t pset)
{
	pset_assert_locked(pset);
	if (rt_pset_has_stealable_threads(pset)) {
		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, rt_runq_earliest_deadline(pset), relaxed);
	} else {
		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE, relaxed);
	}
}

bool
rt_pset_has_stealable_threads(processor_set_t pset)
{
	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);

	return rt_runq_count(pset) > bit_count(avail_map);
}

/*
 * Returns the next processor to IPI for a migrating realtime thread. Realtime
 * spill is only supported with the edge scheduler.
 *
 * Expects starting_pset to be locked. Returns false if starting_pset was never
 * unlocked; otherwise, returns true with no lock held.
 */
bool
rt_choose_next_processor_for_spill_IPI(
	processor_set_t  starting_pset,
	processor_t      chosen_processor,
	processor_t      *result_processor,
	sched_ipi_type_t *result_ipi_type
	)
{
	assert3p(starting_pset, !=, PROCESSOR_SET_NULL);
	assert3p(chosen_processor, !=, PROCESSOR_NULL);

	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
	int max_pri = rt_runq_priority(starting_pset);
	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
	processor_set_t pset = starting_pset; /* lock is held on this pset */
	processor_t next_rt_processor = PROCESSOR_NULL;
	/* Optimization so caller can avoid unnecessary lock-takes if there are no psets eligible for spill: */
	bool starting_pset_was_unlocked = false;

	cpumap_t candidate_map = ~BIT(starting_pset->pset_id); /* exclude starting_pset */
	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
	while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, candidate_map, &istate)) {
		assert3u(starting_pset->pset_id, !=, istate.spis_pset_id);
		/* Check for null pset because we do not use acquire/release semantics for spill order. */
		processor_set_t nset = pset_array[istate.spis_pset_id];
		if (__improbable(nset == PROCESSOR_SET_NULL)) {
			continue;
		}

		/* Make sure the pset is allowed to steal threads from starting_pset's runqueue. */
		sched_clutch_edge edge = sched_rt_config_get((pset_id_t) starting_pset->pset_id, (pset_id_t) istate.spis_pset_id);
		if (istate.spis_pset_id != starting_pset->pset_id && edge.sce_steal_allowed == false) {
			continue;
		}
		pset = change_locked_pset(pset, nset);
		starting_pset_was_unlocked = true;

		next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
		if (next_rt_processor != PROCESSOR_NULL) {
			break;
		}
	}

	if (next_rt_processor != PROCESSOR_NULL) {
		if (pset != starting_pset) {
			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, spill_tid);
			}
		}
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
	if (starting_pset_was_unlocked) {
		pset_unlock(pset);
		return true;
	} else {
		return false;
	}
}
#endif /* CONFIG_SCHED_EDGE */

bool
rt_pset_needs_a_followup_IPI(processor_set_t pset)
{
	int nbackup_cpus = 0;

	if (rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	int rt_rq_count = rt_runq_count(pset);

	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
}

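/*
 * Example (illustrative numbers): with 2 runnable RT threads on a
 * low-latency queue, sched_rt_n_backup_processors == 2, and 3 CPUs in the
 * pset already holding pending urgent ASTs, a follow-up IPI is still wanted
 * because 2 + 2 - 3 == 1 > 0.
 */
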
/*
 * Returns the next processor to IPI as a followup for low-latency realtime
 * threads on the runqueue.
 *
 * pset should be locked, and stays locked the whole time.
 */
void
rt_choose_next_processor_for_followup_IPI(
	processor_set_t pset,
	processor_t chosen_processor,
	processor_t *result_processor,
	sched_ipi_type_t *result_ipi_type)
{
	uint64_t earliest_deadline = rt_runq_earliest_deadline(pset);
	int max_pri = rt_runq_priority(pset);
	processor_t next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
	if (next_rt_processor != PROCESSOR_NULL) {
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
}

#if CONFIG_SCHED_SMT
extern int sched_avoid_cpu0;
extern int sched_allow_rt_smt;

/* pset is locked */
processor_t
pset_choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		primary_map = bit_ror64(primary_map, 1);
	}

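	/*
	 * Worked example of the rotate trick (illustrative): rotating the map
	 * right by 1 moves bit 0 (cpu0) to bit 63, so lsb_first() prefers any
	 * other primary CPU and only lands on cpu0 when it is the sole candidate.
	 * E.g. a primary_map of 0b1001 (cpus 0 and 3) rotates to bits 63 and 2;
	 * lsb_first() returns 2, and (2 + 1) & 63 recovers cpu 3.
	 */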
	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * I was hoping the compiler would optimize
	 * this away when avoid_cpu0 is const bool false
	 * but it still complains about the assignment
	 * in that case.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
#else /* !CONFIG_SCHED_SMT */
/* pset is locked */
processor_t
pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills)
{
	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	int rotid = lsb_first(cpu_map);
	if (rotid >= 0) {
		return processor_array[rotid];
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	return PROCESSOR_NULL;
}
#endif /* !CONFIG_SCHED_SMT */

/*
 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
 *
 * pset is locked.
 */
processor_t
pset_choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
{
	uint64_t  furthest_deadline = rt_deadline_add(minimum_deadline, rt_deadline_epsilon);
	processor_t fd_processor = PROCESSOR_NULL;
	int lowest_priority = max_pri;

	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_pri > lowest_priority) {
			continue;
		}

		if (processor->current_pri < lowest_priority) {
			lowest_priority = processor->current_pri;
			furthest_deadline = processor->deadline;
			fd_processor = processor;
			continue;
		}

		if (processor->deadline > furthest_deadline) {
			furthest_deadline = processor->deadline;
			fd_processor = processor;
		}
	}

	if (fd_processor) {
		return fd_processor;
	}

	/*
	 * There is a race condition possible when there are multiple processor sets.
	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
	 *
	 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
	 * on the run queue of that pset.
	 */
	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
		assert(skip_processor == PROCESSOR_NULL);
		assert(skip_spills == false);

		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
			processor_t processor = processor_array[cpuid];

			if (processor->current_pri > lowest_priority) {
				continue;
			}

			if (processor->current_pri < lowest_priority) {
				lowest_priority = processor->current_pri;
				furthest_deadline = processor->deadline;
				fd_processor = processor;
				continue;
			}

			if (processor->deadline > furthest_deadline) {
				furthest_deadline = processor->deadline;
				fd_processor = processor;
			}
		}
	}

	return fd_processor;
}

bool
rt_clear_pending_spill(processor_t processor, int reason)
{
	processor_set_t pset = processor->processor_set;
	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, reason);
		return true;
	} else {
		return false;
	}
}

#pragma mark - Realtime Runqueues

#if DEBUG || SCHED_TEST_HARNESS
void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	__assert_only bitmap_t *map = rt_run_queue->bitmap;

	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#endif /* DEBUG || SCHED_TEST_HARNESS */

static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	bitmap_set(map, i);

	queue_t     queue       = &rt_runq->pri_queue;
	uint64_t    deadline    = thread->realtime.deadline;
	bool        preempt     = false;
	bool        earliest    = false;

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				if (iter == queue_first(queue)) {
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	thread_set_runq_locked(thread, processor);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}

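/*
 * Example (illustrative): enqueueing a thread with deadline 150 into a
 * pri_queue already holding deadlines {100, 200} places it between them and
 * leaves preempt false; a thread with deadline 50 would go to the head,
 * update pri_earliest_deadline, and return preempt == true.
 */
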
uint64_t
rt_runq_earliest_deadline(processor_set_t pset)
{
	return os_atomic_load_wide(&pset->rt_runq.earliest_deadline, relaxed);
}

/*
 *	rt_runq_insert:
 *
 *	Enqueue a thread for realtime execution.
 */
bool
rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread)
{
	pset_assert_locked(pset);

	bool preempt = rt_runq_enqueue(&pset->rt_runq, thread, processor);
	pset_update_rt_stealable_state(pset);

	return preempt;
}

int
rt_runq_count(processor_set_t pset)
{
	return os_atomic_load(&pset->rt_runq.count, relaxed);
}

int
rt_runq_priority(processor_set_t pset)
{
	pset_assert_locked(pset);
	rt_queue_t rt_run_queue = &pset->rt_runq;

	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert(i < NRTQS);

	if (i >= 0) {
		return i + BASEPRI_RTQUEUES;
	}

	return i;
}

bool
rt_runq_is_low_latency(processor_set_t pset)
{
	return os_atomic_load(&pset->rt_runq.constraint, relaxed) <= rt_constraint_threshold;
}

thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}
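	/*
	 * Numeric sketch of the check above (illustrative values): if the
	 * earliest-deadline (lower-priority) thread needs 2 ms of computation and
	 * the highest-priority thread needs 3 ms under a 10 ms constraint, then
	 * with an epsilon of ~0.1 ms, 2 + 3 + 0.1 < 10 holds and the
	 * earliest-deadline thread is dequeued first; under a 5 ms constraint,
	 * strict priority order would be kept instead.
	 */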

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq(new_thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}

thread_t
rt_runq_first(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	if (i < 0) {
		return THREAD_NULL;
	}
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

	return next_rt;
}

void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq_locked(thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
1471