1 /*
2  * Copyright (c) 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <stdint.h>
30 
31 #include <kern/bits.h>
32 #include <kern/kern_types.h>
33 #include <kern/processor.h>
34 #include <kern/sched_clutch.h>
35 #include <kern/sched_common.h>
36 #include <kern/sched_prim.h>
37 #include <kern/sched_rt.h>
38 #include <kern/thread.h>
39 #include <kern/queue.h>
40 
41 #include <sys/kdebug_kernel.h>
42 
43 #include <os/atomic_private.h>
44 
45 #include <machine/machine_routines.h>
46 
47 #ifdef KDBG_MACOS_RELEASE
48 #define KTRC KDBG_MACOS_RELEASE
49 #else
50 #define KTRC KDBG_RELEASE
51 #endif
52 
53 #pragma mark - Constants and Tunables
54 
55 #if (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS)
56 #include <kern/startup.h>
57 
58 /*
59  * Tunables controlling how xnu initializes the realtime matrix. CLPC can
60  * override their effects with sched_perfcontrol interfaces.
61  */
62 
63 TUNABLE(unsigned int, sched_rt_spill_policy, "sched_rt_spill_policy", 1);
64 
65 TUNABLE(unsigned, sched_rt_steal_policy, "sched_rt_steal_policy", 2);
66 #endif /* (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) */
67 
68 uint32_t rt_deadline_epsilon;
69 uint32_t rt_constraint_threshold;
70 /* epsilon for comparing RT deadlines */
71 int rt_deadline_epsilon_us = 100;
72 uint32_t max_rt_quantum;
73 uint32_t min_rt_quantum;
74 int sched_allow_rt_smt = 1;
75 int sched_rt_runq_strict_priority = false;
76 
77 int
78 sched_get_rt_deadline_epsilon(void)
79 {
80 	return rt_deadline_epsilon_us;
81 }
82 
83 void
84 sched_set_rt_deadline_epsilon(int new_epsilon_us)
85 {
86 	rt_deadline_epsilon_us = new_epsilon_us;
87 
88 	uint64_t abstime;
89 	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
90 	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
91 	rt_deadline_epsilon = (uint32_t)abstime;
92 }
93 
94 #pragma mark - Initialization
95 
96 static int sched_rt_max_clusters = 0;
97 
98 void
99 sched_realtime_timebase_init(void)
100 {
101 	uint64_t abstime;
102 
103 	/* smallest rt computation (50 us) */
104 	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
105 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
106 	min_rt_quantum = (uint32_t)abstime;
107 
108 	/* maximum rt computation (50 ms) */
109 	clock_interval_to_absolutetime_interval(
110 		50, 1000 * NSEC_PER_USEC, &abstime);
111 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
112 	max_rt_quantum = (uint32_t)abstime;
113 
114 	/* constraint threshold for sending backup IPIs (4 ms) */
115 	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
116 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
117 	rt_constraint_threshold = (uint32_t)abstime;
118 
119 	/* epsilon for comparing deadlines */
120 	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
121 }
122 
123 #if CONFIG_SCHED_EDGE
124 /* forward-declare config utility */
125 static void
126 sched_rt_config_pset_push(processor_set_t pset);
127 #endif /* CONFIG_SCHED_EDGE */
128 
129 static void
130 rt_init_completed(void)
131 {
132 	/* This should be unified with sched_edge_max_clusters and moved to a common location. <rdar://145162647> */
133 	sched_rt_max_clusters = ml_get_cluster_count();
134 
135 	/* Realtime spill/steal are only supported on platforms with the edge scheduler. */
136 #if CONFIG_SCHED_EDGE
137 	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
138 	spl_t s = splsched();
139 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
140 	for (int src_cluster_id = 0; src_cluster_id < sched_rt_max_clusters; src_cluster_id++) {
141 		processor_set_t src_pset = pset_array[src_cluster_id];
142 		assert3p(src_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
143 
144 		/* For each cluster, set all its outgoing edge parameters */
145 		for (int dst_cluster_id = 0; dst_cluster_id < sched_rt_max_clusters; dst_cluster_id++) {
146 			if (dst_cluster_id == src_cluster_id) {
147 				continue;
148 			}
149 			processor_set_t dst_pset = pset_array[dst_cluster_id];
150 			assert3p(dst_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
151 
152 			bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
153 			if (clusters_homogenous) {
154 				/* Default realtime policy: spill allowed among homogeneous psets. */
155 				sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
156 					.sce_migration_allowed = true,
157 					.sce_steal_allowed = true,
158 					.sce_migration_weight = 0,
159 				});
160 			} else {
161 				/* Default realtime policy: disallow spill among heterogeneous psets. */
162 				sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
163 					.sce_migration_allowed = false,
164 					.sce_steal_allowed = false,
165 					.sce_migration_weight = 0,
166 				});
167 			}
168 		}
169 	}
170 
171 
172 	for (pset_id_t pset_id = 0; pset_id < sched_rt_max_clusters; pset_id++) {
173 		sched_rt_config_pset_push(pset_array[pset_id]);
174 	}
175 
176 	simple_unlock(&sched_available_cores_lock);
177 	splx(s);
178 #endif /* CONFIG_SCHED_EDGE */
179 }
180 
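/*
 * Reset a pset's realtime run queue to empty: per-priority queues, counts,
 * cached deadlines/constraints, bitmaps and stats. On AMP systems the
 * spill/steal search orders start out invalid until the edge matrix is pushed.
 */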
181 static void
182 pset_rt_init(processor_set_t pset)
183 {
184 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
185 		int i = pri - BASEPRI_RTQUEUES;
186 		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
187 		queue_init(&rqi->pri_queue);
188 		rqi->pri_count = 0;
189 		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
190 		rqi->pri_constraint = RT_CONSTRAINT_NONE;
191 	}
192 	os_atomic_init(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE);
193 
194 	rt_queue_t rt_runq = &pset->rt_runq;
195 	os_atomic_init(&rt_runq->count, 0);
196 	os_atomic_init(&rt_runq->earliest_deadline, RT_DEADLINE_NONE);
197 	os_atomic_init(&rt_runq->constraint, RT_CONSTRAINT_NONE);
198 	os_atomic_init(&rt_runq->ed_index, NOPRI);
199 	bzero(&rt_runq->bitmap, sizeof(rt_runq->bitmap));
200 	bzero(&rt_runq->runq_stats, sizeof(rt_runq->runq_stats));
201 
202 #if __AMP__
203 	/*
204 	 * Initialize spill/steal search orders as invalid to prevent spill/steal
205 	 * before the matrix is configured.
206 	 */
207 	bzero(pset->sched_rt_edges, sizeof(pset->sched_rt_edges));
208 	for (pset_id_t i = 0; i < MAX_PSETS - 1; i++) {
209 		pset->sched_rt_spill_search_order.spso_search_order[i] = PSET_ID_INVALID;
210 #if CONFIG_SCHED_EDGE
211 		pset->sched_rt_steal_search_order.spso_search_order[i] = PSET_ID_INVALID;
212 #endif /* CONFIG_SCHED_EDGE */
213 	}
214 #endif /* __AMP__ */
215 }
216 
217 #pragma mark - Realtime Scheduler/CLPC interface
218 
219 #if CONFIG_SCHED_EDGE
220 void
221 sched_rt_config_set(
222 	uint8_t src_pset,
223 	uint8_t dst_pset,
224 	sched_clutch_edge edge_config)
225 {
226 	assert(src_pset != dst_pset || !edge_config.sce_migration_allowed); /* No self-edges. */
227 	os_atomic_store(&pset_array[src_pset]->sched_rt_edges[dst_pset], edge_config, relaxed);
228 }
229 
230 sched_clutch_edge
231 sched_rt_config_get(
232 	uint8_t src_pset,
233 	uint8_t dst_pset)
234 {
235 	return os_atomic_load(&pset_array[src_pset]->sched_rt_edges[dst_pset], relaxed);
236 }
237 
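/*
 * Copy the requested entries of the src x dst realtime edge matrix into
 * edge_matrix; edge_requests selects which row-major entries the caller wants.
 */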
238 void
239 sched_rt_matrix_get(
240 	sched_clutch_edge *edge_matrix,
241 	bool *edge_requests,
242 	uint64_t num_psets)
243 {
244 	uint64_t edge_index = 0;
245 	for (uint8_t src_pset = 0; src_pset < num_psets; src_pset++) {
246 		for (uint8_t dst_pset = 0; dst_pset < num_psets; dst_pset++) {
247 			if (edge_requests[edge_index]) {
248 				edge_matrix[edge_index] = sched_rt_config_get(src_pset, dst_pset);
249 			}
250 			edge_index++;
251 		}
252 	}
253 }
254 
255 /*
256  * sched_rt_config_pset_push()
257  *
258  * After using sched_rt_config_set() to update edge tunables outgoing from a particular source
259  * pset, this function should be called in order to propagate the updates to derived metadata for
260  * the pset, such as search orders for outgoing spill and steal.
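 *
 * Illustrative sequence (mirroring rt_init_completed() and sched_rt_matrix_set(),
 * which hold sched_available_cores_lock at splsched; src_id, dst_id and new_edge
 * are placeholder names):
 *
 *	sched_rt_config_set(src_id, dst_id, new_edge);     // repeat per outgoing edge of src_id
 *	sched_rt_config_pset_push(pset_array[src_id]);     // recompute spill/steal search orders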
261  */
262 static void
263 sched_rt_config_pset_push(processor_set_t pset)
264 {
265 	assert3u(pset->pset_id, <, UINT8_MAX);
266 
267 	sched_pset_search_order_sort_data_t spill_datas[MAX_PSETS - 1], steal_datas[MAX_PSETS - 1];
268 	uint num_spill_datas = 0, num_steal_datas = 0;
269 	for (pset_id_t other_pset_id = 0; other_pset_id < sched_rt_max_clusters; other_pset_id++) {
270 		if (pset->pset_id == other_pset_id) {
271 			continue; /* No self-edges. */
272 		}
273 		/* Spill */
274 		sched_clutch_edge out_edge = sched_rt_config_get((pset_id_t)pset->pset_cluster_id, other_pset_id);
275 		if (out_edge.sce_migration_allowed) {
276 			spill_datas[num_spill_datas++] = (sched_pset_search_order_sort_data_t) {
277 				.spsosd_src_pset = pset,
278 				.spsosd_migration_weight = out_edge.sce_migration_weight,
279 				.spsosd_dst_pset_id = other_pset_id
280 			};
281 		}
282 		/* Steal */
283 		sched_clutch_edge in_edge = sched_rt_config_get(other_pset_id, (pset_id_t)pset->pset_cluster_id);
284 		if (in_edge.sce_steal_allowed) {
285 			steal_datas[num_steal_datas++] = (sched_pset_search_order_sort_data_t) {
286 				.spsosd_src_pset = pset,
287 				.spsosd_migration_weight = in_edge.sce_migration_weight,
288 				.spsosd_dst_pset_id = other_pset_id,
289 			};
290 		}
291 	}
292 	sched_pset_search_order_compute(&pset->sched_rt_spill_search_order, spill_datas, num_spill_datas, sched_edge_search_order_weight_then_locality_cmp);
293 	sched_pset_search_order_compute(&pset->sched_rt_steal_search_order, steal_datas, num_steal_datas, sched_edge_search_order_weight_then_locality_cmp);
294 }
295 
296 void
297 sched_rt_matrix_set(
298 	sched_clutch_edge *rt_matrix,
299 	bool *edge_changes,
300 	uint64_t num_psets)
301 {
302 	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
303 	spl_t s = splsched();
304 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
305 
306 	for (uint8_t src_pset_id = 0; src_pset_id < num_psets; src_pset_id++) {
307 		for (uint8_t dst_pset_id = 0; dst_pset_id < num_psets; dst_pset_id++) {
308 			const uint64_t rt_matrix_index = src_pset_id * num_psets + dst_pset_id;
309 			if (edge_changes[rt_matrix_index]) {
310 				sched_rt_config_set(src_pset_id, dst_pset_id, rt_matrix[rt_matrix_index]);
311 			}
312 		}
313 	}
314 
315 	for (pset_id_t pset_id = 0; pset_id < num_psets; pset_id++) {
316 		sched_rt_config_pset_push(pset_array[pset_id]);
317 	}
318 
319 	simple_unlock(&sched_available_cores_lock);
320 	splx(s);
321 }
322 #endif /* CONFIG_SCHED_EDGE */
323 
324 #pragma mark - Scheduler Callouts
325 
326 #if CONFIG_SCHED_SMT
327 /*
328  * SMT-aware callout for rt_choose_processor. Called with starting_pset locked; returns with the chosen processor's pset locked, or PROCESSOR_NULL with no pset lock held.
329  */
330 processor_t
331 sched_rtlocal_choose_processor_smt(
332 	processor_set_t         starting_pset,
333 	processor_t             processor,
334 	thread_t                thread)
335 {
336 	processor_set_t nset = PROCESSOR_SET_NULL;
337 	processor_set_t pset = starting_pset;
338 	pset_node_t node = pset->node;
339 
340 	processor_t lc_processor = processor;
341 	integer_t lowest_count = INT_MAX;
342 	if (lc_processor != PROCESSOR_NULL) {
343 		lowest_count = SCHED(processor_runq_count)(processor);
344 	}
345 
346 	bool include_ast_urgent_pending_cpus = false;
347 	cpumap_t ast_urgent_pending;
348 try_again:
349 	ast_urgent_pending = 0;
350 	int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
351 	for (; consider_secondaries < 2; consider_secondaries++) {
352 		pset = change_locked_pset(pset, starting_pset);
353 		do {
354 			cpumap_t available_map = pset_available_cpumap(pset);
355 			if (available_map == 0) {
356 				goto no_available_cpus;
357 			}
358 
359 			processor = pset_choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false);
360 			if (processor) {
361 				return processor;
362 			}
363 
364 			if (consider_secondaries) {
365 				processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
366 				if (processor) {
367 					/*
368 					 * Instead of looping through all the psets to find the global
369 					 * furthest deadline processor, preempt the first candidate found.
370 					 * The preempted thread will then find any other available far deadline
371 					 * processors to preempt.
372 					 */
373 					return processor;
374 				}
375 
376 				ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
377 
378 				if (rt_runq_count(pset) < lowest_count) {
379 					int cpuid = bit_first(available_map);
380 					assert(cpuid >= 0);
381 					lc_processor = processor_array[cpuid];
382 					lowest_count = rt_runq_count(pset);
383 				}
384 			}
385 
386 no_available_cpus:
387 			nset = next_pset(pset);
388 
389 			if (nset != starting_pset) {
390 				pset = change_locked_pset(pset, nset);
391 			}
392 		} while (nset != starting_pset);
393 	}
394 
395 	/* Shortcut for single-pset nodes */
396 	if (bit_count(node->pset_map) == 1) {
397 		if (lc_processor) {
398 			pset_assert_locked(lc_processor->processor_set);
399 			return lc_processor;
400 		}
401 	} else {
402 		if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
403 			/* See the comment in pset_choose_furthest_deadline_processor_for_realtime_thread() */
404 			include_ast_urgent_pending_cpus = true;
405 			goto try_again;
406 		}
407 	}
408 
409 	processor = lc_processor;
410 
411 	if (processor) {
412 		pset = change_locked_pset(pset, processor->processor_set);
413 		/* Check that chosen processor is still usable */
414 		cpumap_t available_map = pset_available_cpumap(pset);
415 		if (bit_test(available_map, processor->cpu_id)) {
416 			return processor;
417 		}
418 
419 		/* processor is no longer usable */
420 		processor = PROCESSOR_NULL;
421 	}
422 
423 	pset_assert_locked(pset);
424 	pset_unlock(pset);
425 	return PROCESSOR_NULL;
426 }
427 #else /* !CONFIG_SCHED_SMT */
428 /*
429  * Called with thread and starting_pset locked. The returned processor's pset is
430  * locked on return.
431  */
432 processor_t
433 sched_rt_choose_processor(
434 	const processor_set_t starting_pset,
435 	processor_t processor,
436 	thread_t thread)
437 {
438 	assert3u(thread->sched_pri, >=, BASEPRI_RTQUEUES);
439 	assert3u(thread->sched_pri, <=, MAXPRI);
440 
441 	/*
442 	 * In choose_starting_pset, we found a good candidate pset for this thread.
443 	 * Now, we pick the best processor to preempt, if there is one.  It is also
444 	 * possible that conditions have changed and the thread should spill to
445 	 * another pset.
446 	 */
447 
448 	processor_set_t pset = starting_pset; /* Lock is held on this pset. */
449 	pset_assert_locked(pset);
450 
451 #if __AMP__
452 	/*
453 	 * If there are processors with outstanding urgent preemptions, we consider
454 	 * them in a second pass. While we are changing pset locks here, it is
455 	 * possible a processor may resolve its outstanding urgent preemption and
456 	 * become eligible to run this thread. See comment in
457 	 * pset_choose_furthest_deadline_processor_for_realtime_thread().
458 	 */
459 	bool found_ast_urgent_pending = false; /* Tracks whether any (eligible) processors have pending urgent ASTs. */
460 	for (int include_ast_urgent_pending_cpus = 0; include_ast_urgent_pending_cpus < 2; include_ast_urgent_pending_cpus++) {
461 		if (include_ast_urgent_pending_cpus && !found_ast_urgent_pending) {
462 			break; /* Skip the second pass. */
463 		}
464 
465 		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
466 		while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, ~0, &istate)) {
467 			/* Switch to the next pset. We need to check for null psets because
468 			 * we do not use acquire/release semantics for the spill order. */
469 			processor_set_t nset = pset_array[istate.spis_pset_id];
470 			if (__improbable(nset == PROCESSOR_SET_NULL)) {
471 				continue;
472 			}
473 			pset = change_locked_pset(pset, nset);
474 
475 			processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
476 			if (processor != PROCESSOR_NULL) {
477 				/* We found a candidate processor on this pset to wake or preempt. */
478 				pset_assert_locked(processor->processor_set);
479 				return processor;
480 			}
481 
482 			/* TODO <rdar://140219824>: Policy question of EDF vs targeting idle cores on another pset. */
483 			processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
484 			if (processor) {
485 				/*
486 				 * Instead of looping through all the psets to find the global
487 				 * furthest deadline processor, preempt the first candidate found.
488 				 * The preempted thread will then find any other available far deadline
489 				 * processors to preempt.
490 				 */
491 				pset_assert_locked(processor->processor_set);
492 				return processor;
493 			}
494 
495 			found_ast_urgent_pending = found_ast_urgent_pending || (pset->pending_AST_URGENT_cpu_mask != 0);
496 		}
497 	}
498 
499 	/*
500 	 * There was no obvious (idle or non-realtime) processor to run the thread.
501 	 * Instead, do EDF scheduling again on starting_pset, putting the thread on
502 	 * the run queue if there is no processor to preempt.
503 	 */
504 
505 	pset = change_locked_pset(pset, starting_pset);
506 #endif /* __AMP__ */
507 
508 	/* Check (again, for AMP systems) that there is no lower-priority or idle processor. */
509 	processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
510 	if (processor != PROCESSOR_NULL) {
511 		/* We found a candidate processor on this pset to wake or preempt. */
512 		pset_assert_locked(processor->processor_set);
513 		return processor;
514 	}
515 
516 	processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, true);
517 	if (processor == PROCESSOR_NULL) {
518 		/* Choose an arbitrary available and recommended processor from the pset.
519 		 * It won't get preempted anyway, since this thread has a later
520 		 * deadline. */
521 		int processor_id = lsb_first(pset_available_cpumap(pset));
522 
523 		/* starting_pset had available, recommended processors coming into
524 		 * rt_choose_processor(), but that might have changed after dropping the
525 		 * pset lock. If there are no such processors, bail out here and let
526 		 * sched_edge_migrate_candidate() find a better starting pset. */
527 		if (processor_id < 0) {
528 			pset_unlock(pset);
529 			return PROCESSOR_NULL;
530 		}
531 
532 		processor = processor_array[processor_id];
533 	}
534 
535 	pset_assert_locked(processor->processor_set);
536 	return processor;
537 }
538 #endif /* !CONFIG_SCHED_SMT */
539 
540 #if CONFIG_SCHED_EDGE
541 /*
542  * Called with stealing_pset locked and returns with stealing_pset locked but
543  * the lock will have been dropped if a thread is returned. The lock may have
544  * been temporarily dropped, even if no thread is returned.
545  */
546 thread_t
547 sched_rt_steal_thread(processor_set_t stealing_pset)
548 {
549 	uint64_t earliest_deadline = rt_runq_earliest_deadline(stealing_pset);
550 	processor_set_t pset = stealing_pset;
551 
552 	/* Continue searching until there are no steal candidates found in a single iteration. */
553 	while (true) {
554 		processor_set_t target_pset = NULL;
555 		uint64_t target_deadline;
556 		if (__improbable(os_sub_overflow(earliest_deadline, rt_deadline_epsilon, &target_deadline))) {
557 			target_deadline = 0;
558 		}
559 
560 		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
561 		while (sched_iterate_psets_ordered(stealing_pset, &stealing_pset->sched_rt_steal_search_order, ~BIT(stealing_pset->pset_id), &istate)) {
562 			assert3s(istate.spis_pset_id, !=, stealing_pset->pset_id); /* stealing_pset's runqueue is drained by sched_rt_choose_processor */
563 			const processor_set_t nset = pset_array[istate.spis_pset_id];
564 			/* Check for null because we do not use acquire/release semantics for steal order. */
565 			if (__improbable(nset == PROCESSOR_SET_NULL)) {
566 				continue;
567 			}
568 			uint64_t nset_deadline = os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed);
569 			if (nset_deadline < target_deadline) {
570 				target_pset = nset;
571 				target_deadline = nset_deadline;
572 			}
573 		}
574 
575 		if (target_pset != PROCESSOR_SET_NULL) {
576 			assert3u(target_deadline, !=, RT_DEADLINE_NONE);
577 
578 			/* target_pset is a candidate for steal. Check again under its pset lock. */
579 
580 			pset = change_locked_pset(pset, target_pset);
581 			if (os_atomic_load(&pset->stealable_rt_threads_earliest_deadline, relaxed) <= target_deadline) {
582 				/* Steal the next thread from target_pset's runqueue. */
583 				thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
584 				pset_update_rt_stealable_state(pset);
585 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
586 
587 				pset = change_locked_pset(pset, stealing_pset);
588 				return new_thread;
589 			} else {
590 				/* Failed to steal (another pset stole first). Try again. */
591 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(THREAD_NULL), pset->pset_id, pset->cpu_set_low, 1);
592 				pset = change_locked_pset(pset, stealing_pset);
593 				/* Update earliest_deadline in case it changed while the stealing_pset lock was not held. */
594 				earliest_deadline = rt_runq_earliest_deadline(pset);
595 				continue;
596 			}
597 		} else {
598 			/* No steal candidates, stop searching. */
599 			break;
600 		}
601 	}
602 	/* No stealable threads, return with stealing_pset locked. */
603 	pset = change_locked_pset(pset, stealing_pset);
604 	return THREAD_NULL;
605 }
606 #endif /* CONFIG_SCHED_EDGE */
607 
608 /*
609  * The processor's pset is locked on entry and on return; the lock may be dropped and retaken in between.
610  */
611 thread_t
612 sched_rt_choose_thread(processor_t processor)
613 {
614 	processor_set_t pset = processor->processor_set;
615 	pset_assert_locked(pset);
616 
617 	if (SCHED(rt_steal_thread) != NULL) {
618 		do {
619 			rt_clear_pending_spill(processor, 2);
620 			thread_t new_thread = SCHED(rt_steal_thread)(pset);
621 			/* pset lock may have been dropped and retaken, is currently locked */
622 			pset_assert_locked(pset);
623 			if (new_thread != THREAD_NULL) {
624 				/* Spill might have been set if the pset lock was dropped in steal. */
625 				rt_clear_pending_spill(processor, 3);
626 				return new_thread;
627 			}
628 		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
629 	}
630 	rt_clear_pending_spill(processor, 5);
631 
632 	if (rt_runq_count(pset) > 0) {
633 		thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
634 		assert(new_thread != THREAD_NULL);
635 		pset_update_rt_stealable_state(pset);
636 		return new_thread;
637 	}
638 
639 	return THREAD_NULL;
640 }
641 
642 void
643 sched_rt_init_pset(processor_set_t pset)
644 {
645 	pset_rt_init(pset);
646 }
647 
648 void
649 sched_rt_init_completed(void)
650 {
651 	rt_init_completed();
652 }
653 
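/*
 * Processor shutdown callout: once the pset has no available processors left,
 * drain its realtime run queue and re-enqueue each thread via thread_setrun()
 * so it can migrate to another pset.
 */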
654 void
655 sched_rt_queue_shutdown(processor_t processor)
656 {
657 	processor_set_t pset = processor->processor_set;
658 	thread_t        thread;
659 	queue_head_t    tqueue;
660 
661 	pset_lock(pset);
662 
663 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
664 	if (bit_count(pset_available_cpumap(pset)) > 0) {
665 		pset_unlock(pset);
666 		return;
667 	}
668 
669 	queue_init(&tqueue);
670 
671 	while (rt_runq_count(pset) > 0) {
672 		thread = rt_runq_dequeue(&pset->rt_runq);
673 		enqueue_tail(&tqueue, &thread->runq_links);
674 	}
675 	sched_update_pset_load_average(pset, 0);
676 	pset_update_rt_stealable_state(pset);
677 	pset_unlock(pset);
678 
679 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
680 		remqueue(&thread->runq_links);
681 
682 		thread_lock(thread);
683 
684 		thread_setrun(thread, SCHED_TAILQ);
685 
686 		thread_unlock(thread);
687 	}
688 }
689 
690 /*
691  * Assumes no pset lock is held; raises splsched and takes each pset's lock itself.
692  * Also records tracepoints for pset bitmasks under the pset lock.
693  */
694 void
695 sched_rt_runq_scan(sched_update_scan_context_t scan_context)
696 {
697 	thread_t        thread;
698 
699 	pset_node_t node = &pset_node0;
700 	processor_set_t pset = node->psets;
701 
702 	spl_t s = splsched();
703 	do {
704 		while (pset != NULL) {
705 			pset_lock(pset);
706 
707 			bitmap_t *map = pset->rt_runq.bitmap;
708 			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
709 				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
710 
711 				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
712 					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
713 						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
714 					}
715 				}
716 			}
717 
718 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PSET_BITMASKS),
719 			    pset->pset_id,
720 			    pset->recommended_bitmask,
721 			    pset->perfcontrol_cpu_migration_bitmask,
722 			    pset->perfcontrol_cpu_preferred_bitmask);
723 
724 			pset_unlock(pset);
725 
726 			pset = pset->pset_list;
727 		}
728 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
729 	splx(s);
730 }
731 
732 int64_t
733 sched_rt_runq_count_sum(void)
734 {
735 	pset_node_t node = &pset_node0;
736 	processor_set_t pset = node->psets;
737 	int64_t count = 0;
738 
739 	do {
740 		while (pset != NULL) {
741 			count += pset->rt_runq.runq_stats.count_sum;
742 
743 			pset = pset->pset_list;
744 		}
745 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
746 
747 	return count;
748 }
749 
750 #pragma mark - Utilities
751 
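/* Saturating deadline addition: returns RT_DEADLINE_NONE instead of wrapping on overflow. */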
752 uint64_t
753 rt_deadline_add(uint64_t d, uint64_t e)
754 {
755 	uint64_t sum;
756 	return os_add_overflow(d, e, &sum) ? RT_DEADLINE_NONE : sum;
757 }
758 
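/*
 * Available CPUs in the pset that are not currently running realtime threads
 * (restricted to primary CPUs when realtime-on-SMT is disallowed).
 */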
759 cpumap_t
760 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
761 {
762 	cpumap_t avail_map = pset_available_cpumap(pset);
763 #if CONFIG_SCHED_SMT
764 	if (!sched_allow_rt_smt) {
765 		/*
766 		 * Secondary CPUs are not allowed to run RT threads, so
767 		 * only primary CPUs should be included
768 		 */
769 		avail_map &= pset->primary_map;
770 	}
771 #endif /* CONFIG_SCHED_SMT */
772 
773 	return avail_map & ~pset->realtime_map;
774 }
775 
776 /* pset is locked */
777 static processor_t
778 pset_choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
779 {
780 	(void) consider_secondaries;
781 	bool skip_spills = true;
782 	bool include_ast_urgent_pending_cpus = false;
783 
784 #if CONFIG_SCHED_SMT
785 	processor_t next_processor = pset_choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills);
786 #else /* CONFIG_SCHED_SMT */
787 	processor_t next_processor = pset_choose_processor_for_realtime_thread(pset, skip_processor, skip_spills);
788 #endif /* CONFIG_SCHED_SMT */
789 	if (next_processor != PROCESSOR_NULL) {
790 		return next_processor;
791 	}
792 
793 	next_processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
794 	return next_processor;
795 }
796 
797 #if CONFIG_SCHED_EDGE
798 /*
799  * Realtime Steal Utilities
800  *
801  * Realtime steal is only supported on platforms with the edge scheduler.
802  */
803 
804 /* Update realtime stealable state. */
805 void
806 pset_update_rt_stealable_state(processor_set_t pset)
807 {
808 	pset_assert_locked(pset);
809 	if (rt_pset_has_stealable_threads(pset)) {
810 		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, rt_runq_earliest_deadline(pset), relaxed);
811 	} else {
812 		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE, relaxed);
813 	}
814 }
815 
816 bool
817 rt_pset_has_stealable_threads(processor_set_t pset)
818 {
819 	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
820 
821 	return rt_runq_count(pset) > bit_count(avail_map);
822 }
823 
824 /*
825  * Returns the next processor to IPI for a migrating realtime thread. Realtime
826  * spill is only supported with the edge scheduler.
827  *
828  * Expects starting_pset to be locked. Returns false if starting_pset was never
829  * unlocked; otherwise, returns true with no lock held.
830  */
831 bool
832 rt_choose_next_processor_for_spill_IPI(
833 	processor_set_t  starting_pset,
834 	processor_t      chosen_processor,
835 	processor_t      *result_processor,
836 	sched_ipi_type_t *result_ipi_type
837 	)
838 {
839 	assert3p(starting_pset, !=, PROCESSOR_SET_NULL);
840 	assert3p(chosen_processor, !=, PROCESSOR_NULL);
841 
842 	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
843 	int max_pri = rt_runq_priority(starting_pset);
844 	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
845 	processor_set_t pset = starting_pset; /* lock is held on this pset */
846 	processor_t next_rt_processor = PROCESSOR_NULL;
847 	/* Optimization so caller can avoid unnecessary lock-takes if there are no psets eligible for spill: */
848 	bool starting_pset_was_unlocked = false;
849 
850 	cpumap_t candidate_map = ~BIT(starting_pset->pset_id); /* exclude stealing_pset */
851 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
852 	while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, candidate_map, &istate)) {
853 		assert3u(starting_pset->pset_id, !=, istate.spis_pset_id);
854 		/* Check for null pset because we do not use acquire/release semantics for spill order. */
855 		processor_set_t nset = pset_array[istate.spis_pset_id];
856 		if (__improbable(nset == PROCESSOR_SET_NULL)) {
857 			continue;
858 		}
859 
860 		/* Make sure the pset is allowed to steal threads from stealing_pset's runqueue. */
861 		sched_clutch_edge edge = sched_rt_config_get((pset_id_t) starting_pset->pset_id, (pset_id_t) istate.spis_pset_id);
862 		if (istate.spis_pset_id != starting_pset->pset_id && edge.sce_steal_allowed == false) {
863 			continue;
864 		}
865 		pset = change_locked_pset(pset, nset);
866 		starting_pset_was_unlocked = true;
867 
868 		next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
869 		if (next_rt_processor != PROCESSOR_NULL) {
870 			break;
871 		}
872 	}
873 
874 	if (next_rt_processor != PROCESSOR_NULL) {
875 		if (pset != starting_pset) {
876 			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
877 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
878 				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, spill_tid);
879 			}
880 		}
881 		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
882 		*result_processor = next_rt_processor;
883 	}
884 	if (starting_pset_was_unlocked) {
885 		pset_unlock(pset);
886 		return true;
887 	} else {
888 		return false;
889 	}
890 }
891 #endif /* CONFIG_SCHED_EDGE */
892 
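/*
 * True when realtime threads remain queued and their count (plus the backup
 * processors requested for low-latency work) exceeds the number of CPUs already
 * signalled with urgent ASTs, i.e. another IPI is warranted.
 */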
893 bool
894 rt_pset_needs_a_followup_IPI(processor_set_t pset)
895 {
896 	int nbackup_cpus = 0;
897 
898 	if (rt_runq_is_low_latency(pset)) {
899 		nbackup_cpus = sched_rt_n_backup_processors;
900 	}
901 
902 	int rt_rq_count = rt_runq_count(pset);
903 
904 	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
905 }
906 
907 /*
908  * Returns the next processor to IPI as a followup for low-latency realtime
909  * threads on the runqueue.
910  *
911  * pset should be locked, and stays locked the whole time.
912  */
913 void
914 rt_choose_next_processor_for_followup_IPI(
915 	processor_set_t pset,
916 	processor_t chosen_processor,
917 	processor_t *result_processor,
918 	sched_ipi_type_t *result_ipi_type)
919 {
920 	uint64_t earliest_deadline = rt_runq_earliest_deadline(pset);
921 	int max_pri = rt_runq_priority(pset);
922 	processor_t next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
923 	if (next_rt_processor != PROCESSOR_NULL) {
924 		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
925 		*result_processor = next_rt_processor;
926 	}
927 }
928 
929 #if CONFIG_SCHED_SMT
930 extern int sched_avoid_cpu0;
931 extern int sched_allow_rt_smt;
932 
933 /* pset is locked */
934 processor_t
935 pset_choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
936 {
937 #if defined(__x86_64__)
938 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
939 #else
940 	const bool avoid_cpu0 = false;
941 #endif
942 	cpumap_t cpu_map;
943 
944 try_again:
945 	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
946 	if (skip_processor) {
947 		bit_clear(cpu_map, skip_processor->cpu_id);
948 	}
949 	if (skip_spills) {
950 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
951 	}
952 
953 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
954 		bit_clear(cpu_map, 0);
955 	}
956 
957 	cpumap_t primary_map = cpu_map & pset->primary_map;
958 	if (avoid_cpu0) {
959 		primary_map = bit_ror64(primary_map, 1);
960 	}
961 
962 	int rotid = lsb_first(primary_map);
963 	if (rotid >= 0) {
964 		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
965 
966 		processor_t processor = processor_array[cpuid];
967 
968 		return processor;
969 	}
970 
971 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
972 		goto out;
973 	}
974 
975 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
976 		/* Also avoid cpu1 */
977 		bit_clear(cpu_map, 1);
978 	}
979 
980 	/* Consider secondary processors whose primary is actually running a realtime thread */
981 	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
982 	if (avoid_cpu0) {
983 		/* Also avoid cpu1 */
984 		secondary_map = bit_ror64(secondary_map, 2);
985 	}
986 	rotid = lsb_first(secondary_map);
987 	if (rotid >= 0) {
988 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
989 
990 		processor_t processor = processor_array[cpuid];
991 
992 		return processor;
993 	}
994 
995 	/* Consider secondary processors */
996 	secondary_map = cpu_map & ~pset->primary_map;
997 	if (avoid_cpu0) {
998 		/* Also avoid cpu1 */
999 		secondary_map = bit_ror64(secondary_map, 2);
1000 	}
1001 	rotid = lsb_first(secondary_map);
1002 	if (rotid >= 0) {
1003 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
1004 
1005 		processor_t processor = processor_array[cpuid];
1006 
1007 		return processor;
1008 	}
1009 
1010 	/*
1011 	 * I was hoping the compiler would optimize
1012 	 * this away when avoid_cpu0 is const bool false
1013 	 * but it still complains about the assignment
1014 	 * in that case.
1015 	 */
1016 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
1017 #if defined(__x86_64__)
1018 		avoid_cpu0 = false;
1019 #else
1020 		assert(0);
1021 #endif
1022 		goto try_again;
1023 	}
1024 
1025 out:
1026 	if (skip_processor) {
1027 		return PROCESSOR_NULL;
1028 	}
1029 
1030 	/*
1031 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
1032 	 * not already running realtime threads than realtime threads in the realtime run queue,
1033 	 * this thread belongs in this pset, so choose some other processor in this pset
1034 	 * to ensure the thread is enqueued here.
1035 	 */
1036 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
1037 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1038 		cpu_map = non_realtime_map;
1039 		assert(cpu_map != 0);
1040 		int cpuid = bit_first(cpu_map);
1041 		assert(cpuid >= 0);
1042 		return processor_array[cpuid];
1043 	}
1044 
1045 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
1046 		goto skip_secondaries;
1047 	}
1048 
1049 	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1050 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1051 		cpu_map = non_realtime_map;
1052 		assert(cpu_map != 0);
1053 		int cpuid = bit_first(cpu_map);
1054 		assert(cpuid >= 0);
1055 		return processor_array[cpuid];
1056 	}
1057 
1058 skip_secondaries:
1059 	return PROCESSOR_NULL;
1060 }
1061 #else /* !CONFIG_SCHED_SMT*/
1062 /* pset is locked */
1063 processor_t
1064 pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills)
1065 {
1066 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
1067 	if (skip_processor) {
1068 		bit_clear(cpu_map, skip_processor->cpu_id);
1069 	}
1070 	if (skip_spills) {
1071 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1072 	}
1073 
1074 	int rotid = lsb_first(cpu_map);
1075 	if (rotid >= 0) {
1076 		return processor_array[rotid];
1077 	}
1078 
1079 	/*
1080 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
1081 	 * not already running realtime threads than realtime threads in the realtime run queue,
1082 	 * this thread belongs in this pset, so choose some other processor in this pset
1083 	 * to ensure the thread is enqueued here.
1084 	 */
1085 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1086 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1087 		cpu_map = non_realtime_map;
1088 		assert(cpu_map != 0);
1089 		int cpuid = bit_first(cpu_map);
1090 		assert(cpuid >= 0);
1091 		return processor_array[cpuid];
1092 	}
1093 
1094 	return PROCESSOR_NULL;
1095 }
1096 #endif /* !CONFIG_SCHED_SMT */
1097 
1098 /*
1099  * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
1100  * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
1101  *
1102  * pset is locked.
1103  */
1104 processor_t
1105 pset_choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
1106 {
1107 	uint64_t  furthest_deadline = rt_deadline_add(minimum_deadline, rt_deadline_epsilon);
1108 	processor_t fd_processor = PROCESSOR_NULL;
1109 	int lowest_priority = max_pri;
1110 
1111 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
1112 	if (skip_processor) {
1113 		bit_clear(cpu_map, skip_processor->cpu_id);
1114 	}
1115 	if (skip_spills) {
1116 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1117 	}
1118 
1119 	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1120 		processor_t processor = processor_array[cpuid];
1121 
1122 		if (processor->current_pri > lowest_priority) {
1123 			continue;
1124 		}
1125 
1126 		if (processor->current_pri < lowest_priority) {
1127 			lowest_priority = processor->current_pri;
1128 			furthest_deadline = processor->deadline;
1129 			fd_processor = processor;
1130 			continue;
1131 		}
1132 
1133 		if (processor->deadline > furthest_deadline) {
1134 			furthest_deadline = processor->deadline;
1135 			fd_processor = processor;
1136 		}
1137 	}
1138 
1139 	if (fd_processor) {
1140 		return fd_processor;
1141 	}
1142 
1143 	/*
1144 	 * There is a race condition possible when there are multiple processor sets.
1145  * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
1146 	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
1147 	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
1148 	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
1149 	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
1150 	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
1151 	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
1152 	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
1153 	 *
1154  * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
1155 	 * on the run queue of that pset.
1156 	 */
1157 	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
1158 		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
1159 		assert(skip_processor == PROCESSOR_NULL);
1160 		assert(skip_spills == false);
1161 
1162 		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1163 			processor_t processor = processor_array[cpuid];
1164 
1165 			if (processor->current_pri > lowest_priority) {
1166 				continue;
1167 			}
1168 
1169 			if (processor->current_pri < lowest_priority) {
1170 				lowest_priority = processor->current_pri;
1171 				furthest_deadline = processor->deadline;
1172 				fd_processor = processor;
1173 				continue;
1174 			}
1175 
1176 			if (processor->deadline > furthest_deadline) {
1177 				furthest_deadline = processor->deadline;
1178 				fd_processor = processor;
1179 			}
1180 		}
1181 	}
1182 
1183 	return fd_processor;
1184 }
1185 
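/*
 * Clear this processor's bit in its pset's rt_pending_spill_cpu_mask; returns
 * true if the bit was set. 'reason' only tags the tracepoint.
 */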
1186 bool
1187 rt_clear_pending_spill(processor_t processor, int reason)
1188 {
1189 	processor_set_t pset = processor->processor_set;
1190 	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
1191 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, reason);
1192 		return true;
1193 	} else {
1194 		return false;
1195 	}
1196 }
1197 
1198 #pragma mark - Realtime Runqueues
1199 
1200 #if DEBUG || SCHED_TEST_HARNESS
1201 void
1202 check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
1203 {
1204 	bitmap_t *map = rt_run_queue->bitmap;
1205 
1206 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1207 	uint32_t constraint = RT_CONSTRAINT_NONE;
1208 	int ed_index = NOPRI;
1209 	int count = 0;
1210 	bool found_thread = false;
1211 
1212 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
1213 		int i = pri - BASEPRI_RTQUEUES;
1214 		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1215 		queue_t queue = &rt_runq->pri_queue;
1216 		queue_entry_t iter;
1217 		int n = 0;
1218 		uint64_t previous_deadline = 0;
1219 		qe_foreach(iter, queue) {
1220 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1221 			assert_thread_magic(iter_thread);
1222 			if (iter_thread == thread) {
1223 				found_thread = true;
1224 			}
1225 			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
1226 			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
1227 			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
1228 			assert(previous_deadline <= iter_thread->realtime.deadline);
1229 			n++;
1230 			if (iter == queue_first(queue)) {
1231 				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
1232 				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
1233 			}
1234 			previous_deadline = iter_thread->realtime.deadline;
1235 		}
1236 		assert(n == rt_runq->pri_count);
1237 		if (n == 0) {
1238 			assert(bitmap_test(map, i) == false);
1239 			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
1240 			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
1241 		} else {
1242 			assert(bitmap_test(map, i) == true);
1243 		}
1244 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1245 			earliest_deadline = rt_runq->pri_earliest_deadline;
1246 			constraint = rt_runq->pri_constraint;
1247 			ed_index = i;
1248 		}
1249 		count += n;
1250 	}
1251 	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
1252 	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
1253 	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
1254 	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
1255 	if (thread) {
1256 		assert(found_thread);
1257 	}
1258 }
1259 #endif /* DEBUG || SCHED_TEST_HARNESS */
1260 
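/*
 * Insert the thread into its priority's queue in deadline order. Returns true
 * if the thread became the head of that queue (earliest deadline at its
 * priority), in which case the caller should consider preemption.
 */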
1261 static bool
1262 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
1263 {
1264 	int pri = thread->sched_pri;
1265 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1266 	int i = pri - BASEPRI_RTQUEUES;
1267 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1268 	bitmap_t *map = rt_run_queue->bitmap;
1269 
1270 	bitmap_set(map, i);
1271 
1272 	queue_t     queue       = &rt_runq->pri_queue;
1273 	uint64_t    deadline    = thread->realtime.deadline;
1274 	bool        preempt     = false;
1275 	bool        earliest    = false;
1276 
1277 	if (queue_empty(queue)) {
1278 		enqueue_tail(queue, &thread->runq_links);
1279 		preempt = true;
1280 		earliest = true;
1281 		rt_runq->pri_earliest_deadline = deadline;
1282 		rt_runq->pri_constraint = thread->realtime.constraint;
1283 	} else {
1284 		/* Insert into rt_runq in thread deadline order */
1285 		queue_entry_t iter;
1286 		qe_foreach(iter, queue) {
1287 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1288 			assert_thread_magic(iter_thread);
1289 
1290 			if (deadline < iter_thread->realtime.deadline) {
1291 				if (iter == queue_first(queue)) {
1292 					preempt = true;
1293 					earliest = true;
1294 					rt_runq->pri_earliest_deadline = deadline;
1295 					rt_runq->pri_constraint = thread->realtime.constraint;
1296 				}
1297 				insque(&thread->runq_links, queue_prev(iter));
1298 				break;
1299 			} else if (iter == queue_last(queue)) {
1300 				enqueue_tail(queue, &thread->runq_links);
1301 				break;
1302 			}
1303 		}
1304 	}
1305 	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
1306 		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
1307 		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
1308 		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
1309 	}
1310 
1311 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1312 	rt_runq->pri_count++;
1313 	os_atomic_inc(&rt_run_queue->count, relaxed);
1314 
1315 	thread_set_runq_locked(thread, processor);
1316 
1317 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1318 
1319 	return preempt;
1320 }
1321 
1322 uint64_t
1323 rt_runq_earliest_deadline(processor_set_t pset)
1324 {
1325 	return os_atomic_load_wide(&pset->rt_runq.earliest_deadline, relaxed);
1326 }
1327 
1328 /*
1329  *	rt_runq_insert:
1330  *
1331  *	Enqueue a thread for realtime execution.
1332  */
1333 bool
1334 rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread)
1335 {
1336 	pset_assert_locked(pset);
1337 
1338 	bool preempt = rt_runq_enqueue(&pset->rt_runq, thread, processor);
1339 	pset_update_rt_stealable_state(pset);
1340 
1341 	return preempt;
1342 }
1343 
1344 int
1345 rt_runq_count(processor_set_t pset)
1346 {
1347 	return os_atomic_load(&pset->rt_runq.count, relaxed);
1348 }
1349 
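/* Highest priority with queued realtime threads, or -1 if the run queue is empty. */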
1350 int
1351 rt_runq_priority(processor_set_t pset)
1352 {
1353 	pset_assert_locked(pset);
1354 	rt_queue_t rt_run_queue = &pset->rt_runq;
1355 
1356 	bitmap_t *map = rt_run_queue->bitmap;
1357 	int i = bitmap_first(map, NRTQS);
1358 	assert(i < NRTQS);
1359 
1360 	if (i >= 0) {
1361 		return i + BASEPRI_RTQUEUES;
1362 	}
1363 
1364 	return i;
1365 }
1366 
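/* True when the earliest-deadline thread's constraint is within rt_constraint_threshold (4 ms by default). */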
1367 bool
1368 rt_runq_is_low_latency(processor_set_t pset)
1369 {
1370 	return os_atomic_load(&pset->rt_runq.constraint, relaxed) <= rt_constraint_threshold;
1371 }
1372 
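/*
 * Dequeue the next realtime thread and refresh the cached earliest deadline,
 * constraint and ed_index. The head of the highest-priority queue is chosen,
 * except that, unless sched_rt_runq_strict_priority is set, the earliest-deadline
 * thread at a lower priority is preferred when both computations (plus the
 * deadline epsilon) still fit within the higher-priority head's constraint.
 */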
1373 thread_t
1374 rt_runq_dequeue(rt_queue_t rt_run_queue)
1375 {
1376 	bitmap_t *map = rt_run_queue->bitmap;
1377 	int i = bitmap_first(map, NRTQS);
1378 	assert((i >= 0) && (i < NRTQS));
1379 
1380 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1381 
1382 	if (!sched_rt_runq_strict_priority) {
1383 		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
1384 		if (ed_index != i) {
1385 			assert((ed_index >= 0) && (ed_index < NRTQS));
1386 			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
1387 
1388 			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
1389 			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1390 
1391 			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
1392 				/* choose the earliest deadline thread */
1393 				rt_runq = ed_runq;
1394 				i = ed_index;
1395 			}
1396 		}
1397 	}
1398 
1399 	assert(rt_runq->pri_count > 0);
1400 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1401 	uint32_t constraint = RT_CONSTRAINT_NONE;
1402 	int ed_index = NOPRI;
1403 	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
1404 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1405 	if (--rt_runq->pri_count > 0) {
1406 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1407 		assert(next_rt != THREAD_NULL);
1408 		earliest_deadline = next_rt->realtime.deadline;
1409 		constraint = next_rt->realtime.constraint;
1410 		ed_index = i;
1411 	} else {
1412 		bitmap_clear(map, i);
1413 	}
1414 	rt_runq->pri_earliest_deadline = earliest_deadline;
1415 	rt_runq->pri_constraint = constraint;
1416 
1417 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1418 		rt_runq = &rt_run_queue->rt_queue_pri[i];
1419 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1420 			earliest_deadline = rt_runq->pri_earliest_deadline;
1421 			constraint = rt_runq->pri_constraint;
1422 			ed_index = i;
1423 		}
1424 	}
1425 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1426 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1427 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1428 	os_atomic_dec(&rt_run_queue->count, relaxed);
1429 
1430 	thread_clear_runq(new_thread);
1431 
1432 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1433 
1434 	return new_thread;
1435 }
1436 
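/*
 * Peek at the head of the highest-priority realtime queue (its earliest-deadline
 * thread) without dequeuing; returns THREAD_NULL if the run queue is empty.
 */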
1437 thread_t
1438 rt_runq_first(rt_queue_t rt_run_queue)
1439 {
1440 	bitmap_t *map = rt_run_queue->bitmap;
1441 	int i = bitmap_first(map, NRTQS);
1442 	if (i < 0) {
1443 		return THREAD_NULL;
1444 	}
1445 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1446 	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1447 
1448 	return next_rt;
1449 }
1450 
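/*
 * Remove a specific thread from its priority queue and recompute the cached
 * earliest deadline, constraint and ed_index, mirroring rt_runq_dequeue().
 */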
1451 void
1452 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
1453 {
1454 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1455 
1456 	int pri = thread->sched_pri;
1457 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1458 	int i = pri - BASEPRI_RTQUEUES;
1459 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1460 	bitmap_t *map = rt_run_queue->bitmap;
1461 
1462 	assert(rt_runq->pri_count > 0);
1463 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1464 	uint32_t constraint = RT_CONSTRAINT_NONE;
1465 	int ed_index = NOPRI;
1466 	remqueue(&thread->runq_links);
1467 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1468 	if (--rt_runq->pri_count > 0) {
1469 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1470 		earliest_deadline = next_rt->realtime.deadline;
1471 		constraint = next_rt->realtime.constraint;
1472 		ed_index = i;
1473 	} else {
1474 		bitmap_clear(map, i);
1475 	}
1476 	rt_runq->pri_earliest_deadline = earliest_deadline;
1477 	rt_runq->pri_constraint = constraint;
1478 
1479 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1480 		rt_runq = &rt_run_queue->rt_queue_pri[i];
1481 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1482 			earliest_deadline = rt_runq->pri_earliest_deadline;
1483 			constraint = rt_runq->pri_constraint;
1484 			ed_index = i;
1485 		}
1486 	}
1487 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1488 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1489 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1490 	os_atomic_dec(&rt_run_queue->count, relaxed);
1491 
1492 	thread_clear_runq_locked(thread);
1493 
1494 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1495 }
1496