1 /*
2  * Copyright (c) 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <stdint.h>
30 
31 #include <kern/bits.h>
32 #include <kern/kern_types.h>
33 #include <kern/processor.h>
34 #include <kern/sched_clutch.h>
35 #include <kern/sched_common.h>
36 #include <kern/sched_prim.h>
37 #include <kern/sched_rt.h>
38 #include <kern/thread.h>
39 #include <kern/queue.h>
40 
41 #include <sys/kdebug_kernel.h>
42 
43 #include <os/atomic_private.h>
44 
45 #include <machine/machine_routines.h>
46 
47 #ifdef KDBG_MACOS_RELEASE
48 #define KTRC KDBG_MACOS_RELEASE
49 #else
50 #define KTRC KDBG_RELEASE
51 #endif
52 
53 #pragma mark - Constants and Tunables
54 
55 #if (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS)
56 #include <kern/startup.h>
57 
58 /*
59  * Tunables controlling how xnu initializes the realtime matrix. CLPC can
60  * override their effects with sched_perfcontrol interfaces.
61  */
62 
63 TUNABLE(unsigned int, sched_rt_spill_policy, "sched_rt_spill_policy", 1);
64 
65 TUNABLE(unsigned, sched_rt_steal_policy, "sched_rt_steal_policy", 2);
66 #endif /* (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) */
67 
68 uint32_t rt_deadline_epsilon;
69 uint32_t rt_constraint_threshold;
70 /* epsilon for comparing RT deadlines */
71 int rt_deadline_epsilon_us = 100;
72 uint32_t max_rt_quantum;
73 uint32_t min_rt_quantum;
74 int sched_allow_rt_smt = 1;
75 int sched_rt_runq_strict_priority = false;
76 
77 int
78 sched_get_rt_deadline_epsilon(void)
79 {
80 	return rt_deadline_epsilon_us;
81 }
82 
83 void
84 sched_set_rt_deadline_epsilon(int new_epsilon_us)
85 {
86 	rt_deadline_epsilon_us = new_epsilon_us;
87 
88 	uint64_t abstime;
89 	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
90 	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
91 	rt_deadline_epsilon = (uint32_t)abstime;
92 }
93 
94 #pragma mark - Initialization
95 
96 static int sched_rt_max_clusters = 0;
97 
98 void
99 sched_realtime_timebase_init(void)
100 {
101 	uint64_t abstime;
102 
103 	/* smallest rt computation (50 us) */
104 	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
105 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
106 	min_rt_quantum = (uint32_t)abstime;
107 
108 	/* maximum rt computation (50 ms) */
109 	clock_interval_to_absolutetime_interval(
110 		50, 1000 * NSEC_PER_USEC, &abstime);
111 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
112 	max_rt_quantum = (uint32_t)abstime;
113 
114 	/* constraint threshold for sending backup IPIs (4 ms) */
115 	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
116 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
117 	rt_constraint_threshold = (uint32_t)abstime;
118 
119 	/* epsilon for comparing deadlines */
120 	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
121 }
122 
123 #if CONFIG_SCHED_EDGE
124 /* forward-declare config utility */
125 static void
126 sched_rt_config_pset_push(processor_set_t pset);
127 #endif /* CONFIG_SCHED_EDGE */
128 
129 static void
130 rt_init_completed(void)
131 {
132 	/* This should be unified with sched_edge_max_clusters and moved to a common location. <rdar://145162647> */
133 	sched_rt_max_clusters = ml_get_cluster_count();
134 
135 	/* Realtime spill/steal are only supported on platforms with the edge scheduler. */
136 #if CONFIG_SCHED_EDGE
137 	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
138 	spl_t s = splsched();
139 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
140 	for (int src_cluster_id = 0; src_cluster_id < sched_rt_max_clusters; src_cluster_id++) {
141 		processor_set_t src_pset = pset_array[src_cluster_id];
142 		assert3p(src_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
143 
144 		/* For each cluster, set all its outgoing edge parameters */
145 		for (int dst_cluster_id = 0; dst_cluster_id < sched_rt_max_clusters; dst_cluster_id++) {
146 			if (dst_cluster_id == src_cluster_id) {
147 				continue;
148 			}
149 			processor_set_t dst_pset = pset_array[dst_cluster_id];
150 			assert3p(dst_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */
151 
152 			bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
153 			if (clusters_homogenous) {
154 				/* Default realtime policy: spill allowed among homogeneous psets. */
155 				sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
156 					.sce_migration_allowed = true,
157 					.sce_steal_allowed = true,
158 					.sce_migration_weight = 0,
159 				});
160 			} else {
161 				/* Default realtime policy: disallow spill among heterogeneous psets. */
162 				sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) {
163 					.sce_migration_allowed = false,
164 					.sce_steal_allowed = false,
165 					.sce_migration_weight = 0,
166 				});
167 			}
168 		}
169 	}
170 
171 
172 	for (pset_id_t pset_id = 0; pset_id < sched_rt_max_clusters; pset_id++) {
173 		sched_rt_config_pset_push(pset_array[pset_id]);
174 	}
175 
176 	simple_unlock(&sched_available_cores_lock);
177 	splx(s);
178 #endif /* CONFIG_SCHED_EDGE */
179 }
180 
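/*
 * Reset a pset's realtime run queue to empty: per-priority queues, counts,
 * cached deadlines/constraints, bitmaps and stats. On AMP systems the
 * spill/steal search orders start out invalid until the edge matrix is pushed.
 */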
181 static void
182 pset_rt_init(processor_set_t pset)
183 {
184 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
185 		int i = pri - BASEPRI_RTQUEUES;
186 		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
187 		queue_init(&rqi->pri_queue);
188 		rqi->pri_count = 0;
189 		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
190 		rqi->pri_constraint = RT_CONSTRAINT_NONE;
191 	}
192 	os_atomic_init(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE);
193 
194 	rt_queue_t rt_runq = &pset->rt_runq;
195 	os_atomic_init(&rt_runq->count, 0);
196 	os_atomic_init(&rt_runq->earliest_deadline, RT_DEADLINE_NONE);
197 	os_atomic_init(&rt_runq->constraint, RT_CONSTRAINT_NONE);
198 	os_atomic_init(&rt_runq->ed_index, NOPRI);
199 	bzero(&rt_runq->bitmap, sizeof(rt_runq->bitmap));
200 	bzero(&rt_runq->runq_stats, sizeof(rt_runq->runq_stats));
201 
202 #if __AMP__
203 	/*
204 	 * Initialize spill/steal search orders as invalid to prevent spill/steal
205 	 * before the matrix is configured.
206 	 */
207 	bzero(pset->sched_rt_edges, sizeof(pset->sched_rt_edges));
208 	for (pset_id_t i = 0; i < MAX_PSETS - 1; i++) {
209 		pset->sched_rt_spill_search_order.spso_search_order[i] = PSET_ID_INVALID;
210 #if CONFIG_SCHED_EDGE
211 		pset->sched_rt_steal_search_order.spso_search_order[i] = PSET_ID_INVALID;
212 #endif /* CONFIG_SCHED_EDGE */
213 	}
214 #endif /* __AMP__ */
215 }
216 
217 #pragma mark - Realtime Scheduler/CLPC interface
218 
219 #if CONFIG_SCHED_EDGE
220 void
221 sched_rt_config_set(
222 	uint8_t src_pset,
223 	uint8_t dst_pset,
224 	sched_clutch_edge edge_config)
225 {
226 	assert(src_pset != dst_pset || !edge_config.sce_migration_allowed); /* No self-edges. */
227 	os_atomic_store(&pset_array[src_pset]->sched_rt_edges[dst_pset], edge_config, relaxed);
228 }
229 
230 sched_clutch_edge
231 sched_rt_config_get(
232 	uint8_t src_pset,
233 	uint8_t dst_pset)
234 {
235 	return os_atomic_load(&pset_array[src_pset]->sched_rt_edges[dst_pset], relaxed);
236 }
237 
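/*
 * Copy the requested entries of the src x dst realtime edge matrix into
 * edge_matrix; edge_requests selects which row-major entries the caller wants.
 */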
238 void
239 sched_rt_matrix_get(
240 	sched_clutch_edge *edge_matrix,
241 	bool *edge_requests,
242 	uint64_t num_psets)
243 {
244 	uint64_t edge_index = 0;
245 	for (uint8_t src_pset = 0; src_pset < num_psets; src_pset++) {
246 		for (uint8_t dst_pset = 0; dst_pset < num_psets; dst_pset++) {
247 			if (edge_requests[edge_index]) {
248 				edge_matrix[edge_index] = sched_rt_config_get(src_pset, dst_pset);
249 			}
250 			edge_index++;
251 		}
252 	}
253 }
254 
255 /*
256  * sched_rt_config_pset_push()
257  *
258  * After using sched_rt_config_set() to update edge tunables outgoing from a particular source
259  * pset, this function should be called in order to propagate the updates to derived metadata for
260  * the pset, such as search orders for outgoing spill and steal.
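 *
 * Illustrative sequence (mirroring rt_init_completed() and sched_rt_matrix_set(),
 * which hold sched_available_cores_lock at splsched; src_id, dst_id and new_edge
 * are placeholder names):
 *
 *	sched_rt_config_set(src_id, dst_id, new_edge);     // repeat per outgoing edge of src_id
 *	sched_rt_config_pset_push(pset_array[src_id]);     // recompute spill/steal search orders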
261  */
262 static void
263 sched_rt_config_pset_push(processor_set_t pset)
264 {
265 	assert3u(pset->pset_id, <, UINT8_MAX);
266 
267 	sched_pset_search_order_sort_data_t spill_datas[MAX_PSETS - 1], steal_datas[MAX_PSETS - 1];
268 	uint num_spill_datas = 0, num_steal_datas = 0;
269 	for (pset_id_t other_pset_id = 0; other_pset_id < sched_rt_max_clusters; other_pset_id++) {
270 		if (pset->pset_id == other_pset_id) {
271 			continue; /* No self-edges. */
272 		}
273 		/* Spill */
274 		sched_clutch_edge out_edge = sched_rt_config_get((pset_id_t)pset->pset_cluster_id, other_pset_id);
275 		if (out_edge.sce_migration_allowed) {
276 			spill_datas[num_spill_datas++] = (sched_pset_search_order_sort_data_t) {
277 				.spsosd_src_pset = pset,
278 				.spsosd_migration_weight = out_edge.sce_migration_weight,
279 				.spsosd_dst_pset_id = other_pset_id
280 			};
281 		}
282 		/* Steal */
283 		sched_clutch_edge in_edge = sched_rt_config_get(other_pset_id, (pset_id_t)pset->pset_cluster_id);
284 		if (in_edge.sce_steal_allowed) {
285 			steal_datas[num_steal_datas++] = (sched_pset_search_order_sort_data_t) {
286 				.spsosd_src_pset = pset,
287 				.spsosd_migration_weight = in_edge.sce_migration_weight,
288 				.spsosd_dst_pset_id = other_pset_id,
289 			};
290 		}
291 	}
292 	sched_pset_search_order_compute(&pset->sched_rt_spill_search_order, spill_datas, num_spill_datas, sched_edge_search_order_weight_then_locality_cmp);
293 	sched_pset_search_order_compute(&pset->sched_rt_steal_search_order, steal_datas, num_steal_datas, sched_edge_search_order_weight_then_locality_cmp);
294 }
295 
296 void
297 sched_rt_matrix_set(
298 	sched_clutch_edge *rt_matrix,
299 	bool *edge_changes,
300 	uint64_t num_psets)
301 {
302 	/* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */
303 	spl_t s = splsched();
304 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
305 
306 	for (uint8_t src_pset_id = 0; src_pset_id < num_psets; src_pset_id++) {
307 		for (uint8_t dst_pset_id = 0; dst_pset_id < num_psets; dst_pset_id++) {
308 			const uint64_t rt_matrix_index = src_pset_id * num_psets + dst_pset_id;
309 			if (edge_changes[rt_matrix_index]) {
310 				sched_rt_config_set(src_pset_id, dst_pset_id, rt_matrix[rt_matrix_index]);
311 			}
312 		}
313 	}
314 
315 	for (pset_id_t pset_id = 0; pset_id < num_psets; pset_id++) {
316 		sched_rt_config_pset_push(pset_array[pset_id]);
317 	}
318 
319 	simple_unlock(&sched_available_cores_lock);
320 	splx(s);
321 }
322 #endif /* CONFIG_SCHED_EDGE */
323 
324 #pragma mark - Scheduler Callouts
325 
326 #if CONFIG_SCHED_SMT
327 /*
328  * SMT-aware callout for rt_choose_processor. Called with starting_pset locked; returns with the chosen processor's pset locked, or PROCESSOR_NULL with no pset lock held.
329  */
330 processor_t
331 sched_rtlocal_choose_processor_smt(
332 	processor_set_t         starting_pset,
333 	processor_t             processor,
334 	thread_t                thread)
335 {
336 	processor_set_t nset = PROCESSOR_SET_NULL;
337 	processor_set_t pset = starting_pset;
338 	pset_node_t node = pset->node;
339 
340 	processor_t lc_processor = processor;
341 	integer_t lowest_count = INT_MAX;
342 	if (lc_processor != PROCESSOR_NULL) {
343 		lowest_count = SCHED(processor_runq_count)(processor);
344 	}
345 
346 	bool include_ast_urgent_pending_cpus = false;
347 	cpumap_t ast_urgent_pending;
348 try_again:
349 	ast_urgent_pending = 0;
350 	int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
351 	for (; consider_secondaries < 2; consider_secondaries++) {
352 		pset = change_locked_pset(pset, starting_pset);
353 		do {
354 			cpumap_t available_map = pset_available_cpumap(pset);
355 			if (available_map == 0) {
356 				goto no_available_cpus;
357 			}
358 
359 			processor = pset_choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false);
360 			if (processor) {
361 				return processor;
362 			}
363 
364 			if (consider_secondaries) {
365 				processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
366 				if (processor) {
367 					/*
368 					 * Instead of looping through all the psets to find the global
369 					 * furthest deadline processor, preempt the first candidate found.
370 					 * The preempted thread will then find any other available far deadline
371 					 * processors to preempt.
372 					 */
373 					return processor;
374 				}
375 
376 				ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
377 
378 				if (rt_runq_count(pset) < lowest_count) {
379 					int cpuid = bit_first(available_map);
380 					assert(cpuid >= 0);
381 					lc_processor = processor_array[cpuid];
382 					lowest_count = rt_runq_count(pset);
383 				}
384 			}
385 
386 no_available_cpus:
387 			nset = next_pset(pset);
388 
389 			if (nset != starting_pset) {
390 				pset = change_locked_pset(pset, nset);
391 			}
392 		} while (nset != starting_pset);
393 	}
394 
395 	/* Shortcut for single-pset nodes */
396 	if (bit_count(node->pset_map) == 1) {
397 		if (lc_processor) {
398 			pset_assert_locked(lc_processor->processor_set);
399 			return lc_processor;
400 		}
401 	} else {
402 		if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
403 			/* See the comment in pset_choose_furthest_deadline_processor_for_realtime_thread() */
404 			include_ast_urgent_pending_cpus = true;
405 			goto try_again;
406 		}
407 	}
408 
409 	processor = lc_processor;
410 
411 	if (processor) {
412 		pset = change_locked_pset(pset, processor->processor_set);
413 		/* Check that chosen processor is still usable */
414 		cpumap_t available_map = pset_available_cpumap(pset);
415 		if (bit_test(available_map, processor->cpu_id)) {
416 			return processor;
417 		}
418 
419 		/* processor is no longer usable */
420 		processor = PROCESSOR_NULL;
421 	}
422 
423 	pset_assert_locked(pset);
424 	pset_unlock(pset);
425 	return PROCESSOR_NULL;
426 }
427 #else /* !CONFIG_SCHED_SMT */
428 /*
429  * Called with thread and starting_pset locked. The returned processor's pset is
430  * locked on return.
431  */
432 processor_t
433 sched_rt_choose_processor(
434 	const processor_set_t starting_pset,
435 	processor_t processor,
436 	thread_t thread)
437 {
438 	assert3u(thread->sched_pri, >=, BASEPRI_RTQUEUES);
439 	assert3u(thread->sched_pri, <=, MAXPRI);
440 
441 	/*
442 	 * In choose_starting_pset, we found a good candidate pset for this thread.
443 	 * Now, we pick the best processor to preempt, if there is one.  It is also
444 	 * possible that conditions have changed and the thread should spill to
445 	 * another pset.
446 	 */
447 
448 	processor_set_t pset = starting_pset; /* Lock is held on this pset. */
449 	pset_assert_locked(pset);
450 
451 #if __AMP__
452 	/*
453 	 * If there are processors with outstanding urgent preemptions, we consider
454 	 * them in a second pass. While we are changing pset locks here, it is
455 	 * possible a processor may resolve its outstanding urgent preemption and
456 	 * become eligible to run this thread. See comment in
457 	 * pset_choose_furthest_deadline_processor_for_realtime_thread().
458 	 */
459 	bool found_ast_urgent_pending = false; /* Tracks whether any (eligible) processors have pending urgent ASTs. */
460 	for (int include_ast_urgent_pending_cpus = 0; include_ast_urgent_pending_cpus < 2; include_ast_urgent_pending_cpus++) {
461 		if (include_ast_urgent_pending_cpus && !found_ast_urgent_pending) {
462 			break; /* Skip the second pass. */
463 		}
464 
465 		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
466 		while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, ~0, &istate)) {
467 			/* Switch to the next pset. We need to check for null psets because
468 			 * we do not use acquire/release semantics for the spill order. */
469 			processor_set_t nset = pset_array[istate.spis_pset_id];
470 			if (__improbable(nset == PROCESSOR_SET_NULL)) {
471 				continue;
472 			}
473 			pset = change_locked_pset(pset, nset);
474 
475 			processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
476 			if (processor != PROCESSOR_NULL) {
477 				/* We found a candidate processor on this pset to wake or preempt. */
478 				pset_assert_locked(processor->processor_set);
479 				return processor;
480 			}
481 
482 			/* TODO <rdar://140219824>: Policy question of EDF vs targeting idle cores on another pset. */
483 			processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
484 			if (processor) {
485 				/*
486 				 * Instead of looping through all the psets to find the global
487 				 * furthest deadline processor, preempt the first candidate found.
488 				 * The preempted thread will then find any other available far deadline
489 				 * processors to preempt.
490 				 */
491 				pset_assert_locked(processor->processor_set);
492 				return processor;
493 			}
494 
495 			found_ast_urgent_pending = found_ast_urgent_pending || (pset->pending_AST_URGENT_cpu_mask != 0);
496 		}
497 	}
498 
499 	/*
500 	 * There was no obvious (idle or non-realtime) processor to run the thread.
501 	 * Instead, do EDF scheduling again on starting_pset, putting the thread on
502 	 * the run queue if there is no processor to preempt.
503 	 */
504 
505 	pset = change_locked_pset(pset, starting_pset);
506 #endif /* __AMP__ */
507 
508 	/* Check (again, for AMP systems) that there is no lower-priority or idle processor. */
509 	processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false);
510 	if (processor != PROCESSOR_NULL) {
511 		/* We found a candidate processor on this pset to wake or preempt. */
512 		pset_assert_locked(processor->processor_set);
513 		return processor;
514 	}
515 
516 	processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, true);
517 	if (processor == PROCESSOR_NULL) {
518 		/* Choose an arbitrary available and recommended processor from the pset.
519 		 * It won't get preempted anyway, since this thread has a later
520 		 * deadline. */
521 		int processor_id = lsb_first(pset_available_cpumap(pset));
522 
523 		/* starting_pset had available, recommended processors coming into
524 		 * rt_choose_processor(), but that might have changed after dropping the
525 		 * pset lock. If there are no such processors, bail out here and let
526 		 * sched_edge_migrate_candidate() find a better starting pset. */
527 		if (processor_id < 0) {
528 			pset_unlock(pset);
529 			return PROCESSOR_NULL;
530 		}
531 
532 		processor = processor_array[processor_id];
533 	}
534 
535 	pset_assert_locked(processor->processor_set);
536 	return processor;
537 }
538 #endif /* !CONFIG_SCHED_SMT */
539 
540 #if CONFIG_SCHED_EDGE
541 /*
542  * Called with stealing_pset locked and returns with stealing_pset locked but
543  * the lock will have been dropped if a thread is returned. The lock may have
544  * been temporarily dropped, even if no thread is returned.
545  */
546 thread_t
547 sched_rt_steal_thread(processor_set_t stealing_pset)
548 {
549 	uint64_t earliest_deadline = rt_runq_earliest_deadline(stealing_pset);
550 	processor_set_t pset = stealing_pset;
551 
552 	/* Continue searching until there are no steal candidates found in a single iteration. */
553 	while (true) {
554 		processor_set_t target_pset = NULL;
555 		uint64_t target_deadline;
556 		if (__improbable(os_sub_overflow(earliest_deadline, rt_deadline_epsilon, &target_deadline))) {
557 			target_deadline = 0;
558 		}
559 
560 		sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
561 		while (sched_iterate_psets_ordered(stealing_pset, &stealing_pset->sched_rt_steal_search_order, ~BIT(stealing_pset->pset_id), &istate)) {
562 			assert3s(istate.spis_pset_id, !=, stealing_pset->pset_id); /* stealing_pset's runqueue is drained by sched_rt_choose_processor */
563 			const processor_set_t nset = pset_array[istate.spis_pset_id];
564 			/* Check for null because we do not use acquire/release semantics for steal order. */
565 			if (__improbable(nset == PROCESSOR_SET_NULL)) {
566 				continue;
567 			}
568 			uint64_t nset_deadline = os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed);
569 			if (nset_deadline < target_deadline) {
570 				target_pset = nset;
571 				target_deadline = nset_deadline;
572 			}
573 		}
574 
575 		if (target_pset != PROCESSOR_SET_NULL) {
576 			assert3u(target_deadline, !=, RT_DEADLINE_NONE);
577 
578 			/* target_pset is a candidate for steal. Check again under its pset lock. */
579 
580 			pset = change_locked_pset(pset, target_pset);
581 			if (os_atomic_load(&pset->stealable_rt_threads_earliest_deadline, relaxed) <= target_deadline) {
582 				/* Steal the next thread from target_pset's runqueue. */
583 				thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
584 				pset_update_rt_stealable_state(pset);
585 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
586 
587 				pset = change_locked_pset(pset, stealing_pset);
588 				return new_thread;
589 			} else {
590 				/* Failed to steal (another pset stole first). Try again. */
591 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(THREAD_NULL), pset->pset_id, pset->cpu_set_low, 1);
592 				pset = change_locked_pset(pset, stealing_pset);
593 				/* Update earliest_deadline in case it changed while the stealing_pset lock was not held. */
594 				earliest_deadline = rt_runq_earliest_deadline(pset);
595 				continue;
596 			}
597 		} else {
598 			/* No steal candidates, stop searching. */
599 			break;
600 		}
601 	}
602 	/* No stealable threads, return with stealing_pset locked. */
603 	pset = change_locked_pset(pset, stealing_pset);
604 	return THREAD_NULL;
605 }
606 #endif /* CONFIG_SCHED_EDGE */
607 
608 /*
609  * The processor's pset is locked on entry and on return; the lock may be dropped and retaken in between.
610  */
611 thread_t
612 sched_rt_choose_thread(processor_t processor)
613 {
614 	processor_set_t pset = processor->processor_set;
615 	pset_assert_locked(pset);
616 
617 	if (SCHED(rt_steal_thread) != NULL) {
618 		do {
619 			rt_clear_pending_spill(processor, 2);
620 			thread_t new_thread = SCHED(rt_steal_thread)(pset);
621 			/* pset lock may have been dropped and retaken, is currently locked */
622 			pset_assert_locked(pset);
623 			if (new_thread != THREAD_NULL) {
624 				/* Spill might have been set if the pset lock was dropped in steal. */
625 				rt_clear_pending_spill(processor, 3);
626 				return new_thread;
627 			}
628 		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
629 	}
630 	rt_clear_pending_spill(processor, 5);
631 
632 	if (rt_runq_count(pset) > 0) {
633 		thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
634 		assert(new_thread != THREAD_NULL);
635 		pset_update_rt_stealable_state(pset);
636 		return new_thread;
637 	}
638 
639 	return THREAD_NULL;
640 }
641 
642 void
643 sched_rt_init_pset(processor_set_t pset)
644 {
645 	pset_rt_init(pset);
646 }
647 
648 void
649 sched_rt_init_completed(void)
650 {
651 	rt_init_completed();
652 }
653 
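/*
 * Processor shutdown callout: once the pset has no available processors left,
 * drain its realtime run queue and re-enqueue each thread via thread_setrun()
 * so it can migrate to another pset.
 */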
654 void
655 sched_rt_queue_shutdown(processor_t processor)
656 {
657 	processor_set_t pset = processor->processor_set;
658 	thread_t        thread;
659 	queue_head_t    tqueue;
660 
661 	pset_lock(pset);
662 
663 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
664 	if (bit_count(pset_available_cpumap(pset)) > 0) {
665 		pset_unlock(pset);
666 		return;
667 	}
668 
669 	queue_init(&tqueue);
670 
671 	while (rt_runq_count(pset) > 0) {
672 		thread = rt_runq_dequeue(&pset->rt_runq);
673 		enqueue_tail(&tqueue, &thread->runq_links);
674 	}
675 	sched_update_pset_load_average(pset, 0);
676 	pset_update_rt_stealable_state(pset);
677 	pset_unlock(pset);
678 
679 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
680 		remqueue(&thread->runq_links);
681 
682 		thread_lock(thread);
683 
684 		thread_setrun(thread, SCHED_TAILQ);
685 
686 		thread_unlock(thread);
687 	}
688 }
689 
690 /*
691  * Assumes no pset lock is held; raises splsched and takes each pset's lock itself.
692  * Also records tracepoints for pset bitmasks under the pset lock.
693  */
694 void
695 sched_rt_runq_scan(sched_update_scan_context_t scan_context)
696 {
697 	thread_t        thread;
698 
699 	pset_node_t node = &pset_node0;
700 	processor_set_t pset = node->psets;
701 
702 	spl_t s = splsched();
703 	do {
704 		while (pset != NULL) {
705 			pset_lock(pset);
706 
707 			bitmap_t *map = pset->rt_runq.bitmap;
708 			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
709 				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
710 
711 				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
712 					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
713 						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
714 					}
715 				}
716 			}
717 
718 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PSET_BITMASKS),
719 			    pset->pset_id,
720 			    pset->recommended_bitmask,
721 			    pset->perfcontrol_cpu_migration_bitmask,
722 			    pset->perfcontrol_cpu_preferred_bitmask);
723 
724 			pset_unlock(pset);
725 
726 			pset = pset->pset_list;
727 		}
728 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
729 	splx(s);
730 }
731 
732 int64_t
733 sched_rt_runq_count_sum(void)
734 {
735 	pset_node_t node = &pset_node0;
736 	processor_set_t pset = node->psets;
737 	int64_t count = 0;
738 
739 	do {
740 		while (pset != NULL) {
741 			count += pset->rt_runq.runq_stats.count_sum;
742 
743 			pset = pset->pset_list;
744 		}
745 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
746 
747 	return count;
748 }
749 
750 #pragma mark - Utilities
751 
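/* Saturating deadline addition: returns RT_DEADLINE_NONE instead of wrapping on overflow. */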
752 uint64_t
753 rt_deadline_add(uint64_t d, uint64_t e)
754 {
755 	uint64_t sum;
756 	return os_add_overflow(d, e, &sum) ? RT_DEADLINE_NONE : sum;
757 }
758 
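/*
 * Available CPUs in the pset that are not currently running realtime threads
 * (restricted to primary CPUs when realtime-on-SMT is disallowed).
 */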
759 cpumap_t
760 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
761 {
762 	cpumap_t avail_map = pset_available_cpumap(pset);
763 #if CONFIG_SCHED_SMT
764 	if (!sched_allow_rt_smt) {
765 		/*
766 		 * Secondary CPUs are not allowed to run RT threads, so
767 		 * only primary CPUs should be included
768 		 */
769 		avail_map &= pset->primary_map;
770 	}
771 #endif /* CONFIG_SCHED_SMT */
772 
773 	return avail_map & ~pset->realtime_map;
774 }
775 
776 /* pset is locked */
777 static processor_t
778 pset_choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
779 {
780 	(void) consider_secondaries;
781 	bool skip_spills = true;
782 	bool include_ast_urgent_pending_cpus = false;
783 
784 #if CONFIG_SCHED_SMT
785 	processor_t next_processor = pset_choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills);
786 #else /* CONFIG_SCHED_SMT */
787 	processor_t next_processor = pset_choose_processor_for_realtime_thread(pset, skip_processor, skip_spills);
788 #endif /* CONFIG_SCHED_SMT */
789 	if (next_processor != PROCESSOR_NULL) {
790 		return next_processor;
791 	}
792 
793 	next_processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
794 	return next_processor;
795 }
796 
797 #if CONFIG_SCHED_EDGE
798 /*
799  * Realtime Steal Utilities
800  *
801  * Realtime steal is only supported on platforms with the edge scheduler.
802  */
803 
804 /* Update realtime stealable state. */
805 void
806 pset_update_rt_stealable_state(processor_set_t pset)
807 {
808 	pset_assert_locked(pset);
809 	if (rt_pset_has_stealable_threads(pset)) {
810 		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, rt_runq_earliest_deadline(pset), relaxed);
811 	} else {
812 		os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE, relaxed);
813 	}
814 }
815 
816 bool
817 rt_pset_has_stealable_threads(processor_set_t pset)
818 {
819 	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
820 
821 	return rt_runq_count(pset) > bit_count(avail_map);
822 }
823 
824 /*
825  * Returns the next processor to IPI for a migrating realtime thread. Realtime
826  * spill is only supported with the edge scheduler.
827  *
828  * Expects starting_pset to be locked. Returns false if starting_pset was never
829  * unlocked; otherwise, returns true with no lock held.
830  */
831 bool
832 rt_choose_next_processor_for_spill_IPI(
833 	processor_set_t  starting_pset,
834 	processor_t      chosen_processor,
835 	processor_t      *result_processor,
836 	sched_ipi_type_t *result_ipi_type
837 	)
838 {
839 	assert3p(starting_pset, !=, PROCESSOR_SET_NULL);
840 	assert3p(chosen_processor, !=, PROCESSOR_NULL);
841 
842 	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
843 	int max_pri = rt_runq_priority(starting_pset);
844 	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
845 	processor_set_t pset = starting_pset; /* lock is held on this pset */
846 	processor_t next_rt_processor = PROCESSOR_NULL;
847 	/* Optimization so caller can avoid unnecessary lock-takes if there are no psets eligible for spill: */
848 	bool starting_pset_was_unlocked = false;
849 
850 	cpumap_t candidate_map = ~BIT(starting_pset->pset_id); /* exclude stealing_pset */
851 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
852 	while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, candidate_map, &istate)) {
853 		assert3u(starting_pset->pset_id, !=, istate.spis_pset_id);
854 		/* Check for null pset because we do not use acquire/release semantics for spill order. */
855 		processor_set_t nset = pset_array[istate.spis_pset_id];
856 		if (__improbable(nset == PROCESSOR_SET_NULL)) {
857 			continue;
858 		}
859 
860 		/* Make sure the pset is allowed to steal threads from stealing_pset's runqueue. */
861 		sched_clutch_edge edge = sched_rt_config_get((pset_id_t) starting_pset->pset_id, (pset_id_t) istate.spis_pset_id);
862 		if (istate.spis_pset_id != starting_pset->pset_id && edge.sce_steal_allowed == false) {
863 			continue;
864 		}
865 		pset = change_locked_pset(pset, nset);
866 		starting_pset_was_unlocked = true;
867 
868 		next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
869 		if (next_rt_processor != PROCESSOR_NULL) {
870 			break;
871 		}
872 	}
873 
874 	if (next_rt_processor != PROCESSOR_NULL) {
875 		if (pset != starting_pset) {
876 			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
877 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
878 				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, spill_tid);
879 			}
880 		}
881 		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
882 		*result_processor = next_rt_processor;
883 	}
884 	if (starting_pset_was_unlocked) {
885 		pset_unlock(pset);
886 		return true;
887 	} else {
888 		return false;
889 	}
890 }
891 #endif /* CONFIG_SCHED_EDGE */
892 
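/*
 * True when realtime threads remain queued and their count (plus the backup
 * processors requested for low-latency work) exceeds the number of CPUs already
 * signalled with urgent ASTs, i.e. another IPI is warranted.
 */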
893 bool
894 rt_pset_needs_a_followup_IPI(processor_set_t pset)
895 {
896 	int nbackup_cpus = 0;
897 
898 	if (rt_runq_is_low_latency(pset)) {
899 		nbackup_cpus = sched_rt_n_backup_processors;
900 	}
901 
902 	int rt_rq_count = rt_runq_count(pset);
903 
904 	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
905 }
906 
907 /*
908  * Returns the next processor to IPI as a followup for low-latency realtime
909  * threads on the runqueue.
910  *
911  * pset should be locked, and stays locked the whole time.
912  */
913 void
914 rt_choose_next_processor_for_followup_IPI(
915 	processor_set_t pset,
916 	processor_t chosen_processor,
917 	processor_t *result_processor,
918 	sched_ipi_type_t *result_ipi_type)
919 {
920 	uint64_t earliest_deadline = rt_runq_earliest_deadline(pset);
921 	int max_pri = rt_runq_priority(pset);
922 	processor_t next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true);
923 	if (next_rt_processor != PROCESSOR_NULL) {
924 		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
925 		*result_processor = next_rt_processor;
926 	}
927 }
928 
929 #if CONFIG_SCHED_SMT
930 extern int sched_avoid_cpu0;
931 extern int sched_allow_rt_smt;
932 
933 /* pset is locked */
934 processor_t
935 pset_choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
936 {
937 #if defined(__x86_64__)
938 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
939 #else
940 	const bool avoid_cpu0 = false;
941 #endif
942 	cpumap_t cpu_map;
943 
944 try_again:
945 	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
946 	if (skip_processor) {
947 		bit_clear(cpu_map, skip_processor->cpu_id);
948 	}
949 	if (skip_spills) {
950 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
951 	}
952 
953 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
954 		bit_clear(cpu_map, 0);
955 	}
956 
957 	cpumap_t primary_map = cpu_map & pset->primary_map;
958 	if (avoid_cpu0) {
959 		primary_map = bit_ror64(primary_map, 1);
960 	}
961 
962 	int rotid = lsb_first(primary_map);
963 	if (rotid >= 0) {
964 		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
965 
966 		processor_t processor = processor_array[cpuid];
967 
968 		return processor;
969 	}
970 
971 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
972 		goto out;
973 	}
974 
975 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
976 		/* Also avoid cpu1 */
977 		bit_clear(cpu_map, 1);
978 	}
979 
980 	/* Consider secondary processors whose primary is actually running a realtime thread */
981 	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
982 	if (avoid_cpu0) {
983 		/* Also avoid cpu1 */
984 		secondary_map = bit_ror64(secondary_map, 2);
985 	}
986 	rotid = lsb_first(secondary_map);
987 	if (rotid >= 0) {
988 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
989 
990 		processor_t processor = processor_array[cpuid];
991 
992 		return processor;
993 	}
994 
995 	/* Consider secondary processors */
996 	secondary_map = cpu_map & ~pset->primary_map;
997 	if (avoid_cpu0) {
998 		/* Also avoid cpu1 */
999 		secondary_map = bit_ror64(secondary_map, 2);
1000 	}
1001 	rotid = lsb_first(secondary_map);
1002 	if (rotid >= 0) {
1003 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
1004 
1005 		processor_t processor = processor_array[cpuid];
1006 
1007 		return processor;
1008 	}
1009 
1010 	/*
1011 	 * I was hoping the compiler would optimize
1012 	 * this away when avoid_cpu0 is const bool false
1013 	 * but it still complains about the assignment
1014 	 * in that case.
1015 	 */
1016 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
1017 #if defined(__x86_64__)
1018 		avoid_cpu0 = false;
1019 #else
1020 		assert(0);
1021 #endif
1022 		goto try_again;
1023 	}
1024 
1025 out:
1026 	if (skip_processor) {
1027 		return PROCESSOR_NULL;
1028 	}
1029 
1030 	/*
1031 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
1032 	 * not already running realtime threads than realtime threads in the realtime run queue,
1033 	 * this thread belongs in this pset, so choose some other processor in this pset
1034 	 * to ensure the thread is enqueued here.
1035 	 */
1036 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
1037 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1038 		cpu_map = non_realtime_map;
1039 		assert(cpu_map != 0);
1040 		int cpuid = bit_first(cpu_map);
1041 		assert(cpuid >= 0);
1042 		return processor_array[cpuid];
1043 	}
1044 
1045 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
1046 		goto skip_secondaries;
1047 	}
1048 
1049 	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1050 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1051 		cpu_map = non_realtime_map;
1052 		assert(cpu_map != 0);
1053 		int cpuid = bit_first(cpu_map);
1054 		assert(cpuid >= 0);
1055 		return processor_array[cpuid];
1056 	}
1057 
1058 skip_secondaries:
1059 	return PROCESSOR_NULL;
1060 }
1061 #else /* !CONFIG_SCHED_SMT*/
1062 /* pset is locked */
1063 processor_t
1064 pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills)
1065 {
1066 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
1067 	if (skip_processor) {
1068 		bit_clear(cpu_map, skip_processor->cpu_id);
1069 	}
1070 	if (skip_spills) {
1071 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1072 	}
1073 
1074 	int rotid = lsb_first(cpu_map);
1075 	if (rotid >= 0) {
1076 		return processor_array[rotid];
1077 	}
1078 
1079 	/*
1080 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
1081 	 * not already running realtime threads than realtime threads in the realtime run queue,
1082 	 * this thread belongs in this pset, so choose some other processor in this pset
1083 	 * to ensure the thread is enqueued here.
1084 	 */
1085 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
1086 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
1087 		cpu_map = non_realtime_map;
1088 		assert(cpu_map != 0);
1089 		int cpuid = bit_first(cpu_map);
1090 		assert(cpuid >= 0);
1091 		return processor_array[cpuid];
1092 	}
1093 
1094 	return PROCESSOR_NULL;
1095 }
1096 #endif /* !CONFIG_SCHED_SMT */
1097 
1098 /*
1099  * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
1100  * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
1101  *
1102  * pset is locked.
1103  */
1104 processor_t
1105 pset_choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
1106 {
1107 	uint64_t  furthest_deadline = rt_deadline_add(minimum_deadline, rt_deadline_epsilon);
1108 	processor_t fd_processor = PROCESSOR_NULL;
1109 	int lowest_priority = max_pri;
1110 
1111 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
1112 	if (skip_processor) {
1113 		bit_clear(cpu_map, skip_processor->cpu_id);
1114 	}
1115 	if (skip_spills) {
1116 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
1117 	}
1118 
1119 	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1120 		processor_t processor = processor_array[cpuid];
1121 
1122 		if (processor->current_pri > lowest_priority) {
1123 			continue;
1124 		}
1125 
1126 		if (processor->current_pri < lowest_priority) {
1127 			lowest_priority = processor->current_pri;
1128 			furthest_deadline = processor->deadline;
1129 			fd_processor = processor;
1130 			continue;
1131 		}
1132 
1133 		if (processor->deadline > furthest_deadline) {
1134 			furthest_deadline = processor->deadline;
1135 			fd_processor = processor;
1136 		}
1137 	}
1138 
1139 	if (fd_processor) {
1140 		return fd_processor;
1141 	}
1142 
1143 	/*
1144 	 * There is a race condition possible when there are multiple processor sets.
1145  * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
1146 	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
1147 	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
1148 	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
1149 	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
1150 	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
1151 	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
1152 	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
1153 	 *
1154  * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
1155 	 * on the run queue of that pset.
1156 	 */
1157 	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
1158 		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
1159 		assert(skip_processor == PROCESSOR_NULL);
1160 		assert(skip_spills == false);
1161 
1162 		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
1163 			processor_t processor = processor_array[cpuid];
1164 
1165 			if (processor->current_pri > lowest_priority) {
1166 				continue;
1167 			}
1168 
1169 			if (processor->current_pri < lowest_priority) {
1170 				lowest_priority = processor->current_pri;
1171 				furthest_deadline = processor->deadline;
1172 				fd_processor = processor;
1173 				continue;
1174 			}
1175 
1176 			if (processor->deadline > furthest_deadline) {
1177 				furthest_deadline = processor->deadline;
1178 				fd_processor = processor;
1179 			}
1180 		}
1181 	}
1182 
1183 	return fd_processor;
1184 }
1185 
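/*
 * Clear this processor's bit in its pset's rt_pending_spill_cpu_mask; returns
 * true if the bit was set. 'reason' only tags the tracepoint.
 */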
1186 bool
1187 rt_clear_pending_spill(processor_t processor, int reason)
1188 {
1189 	processor_set_t pset = processor->processor_set;
1190 	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
1191 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, reason);
1192 		return true;
1193 	} else {
1194 		return false;
1195 	}
1196 }
1197 
1198 #pragma mark - Realtime Runqueues
1199 
1200 #if DEBUG || SCHED_TEST_HARNESS
1201 void
1202 check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
1203 {
1204 	bitmap_t *map = rt_run_queue->bitmap;
1205 
1206 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1207 	uint32_t constraint = RT_CONSTRAINT_NONE;
1208 	int ed_index = NOPRI;
1209 	int count = 0;
1210 	bool found_thread = false;
1211 
1212 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
1213 		int i = pri - BASEPRI_RTQUEUES;
1214 		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1215 		queue_t queue = &rt_runq->pri_queue;
1216 		queue_entry_t iter;
1217 		int n = 0;
1218 		uint64_t previous_deadline = 0;
1219 		qe_foreach(iter, queue) {
1220 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1221 			assert_thread_magic(iter_thread);
1222 			if (iter_thread == thread) {
1223 				found_thread = true;
1224 			}
1225 			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
1226 			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
1227 			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
1228 			assert(previous_deadline <= iter_thread->realtime.deadline);
1229 			n++;
1230 			if (iter == queue_first(queue)) {
1231 				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
1232 				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
1233 			}
1234 			previous_deadline = iter_thread->realtime.deadline;
1235 		}
1236 		assert(n == rt_runq->pri_count);
1237 		if (n == 0) {
1238 			assert(bitmap_test(map, i) == false);
1239 			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
1240 			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
1241 		} else {
1242 			assert(bitmap_test(map, i) == true);
1243 		}
1244 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1245 			earliest_deadline = rt_runq->pri_earliest_deadline;
1246 			constraint = rt_runq->pri_constraint;
1247 			ed_index = i;
1248 		}
1249 		count += n;
1250 	}
1251 	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
1252 	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
1253 	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
1254 	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
1255 	if (thread) {
1256 		assert(found_thread);
1257 	}
1258 }
1259 #endif /* DEBUG || SCHED_TEST_HARNESS */
1260 
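/*
 * Insert the thread into its priority's queue in deadline order. Returns true
 * if the thread became the head of that queue (earliest deadline at its
 * priority), in which case the caller should consider preemption.
 */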
1261 static bool
1262 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
1263 {
1264 	int pri = thread->sched_pri;
1265 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1266 	int i = pri - BASEPRI_RTQUEUES;
1267 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1268 	bitmap_t *map = rt_run_queue->bitmap;
1269 
1270 	bitmap_set(map, i);
1271 
1272 	queue_t     queue       = &rt_runq->pri_queue;
1273 	uint64_t    deadline    = thread->realtime.deadline;
1274 	bool        preempt     = false;
1275 	bool        earliest    = false;
1276 
1277 	if (queue_empty(queue)) {
1278 		enqueue_tail(queue, &thread->runq_links);
1279 		preempt = true;
1280 		earliest = true;
1281 		rt_runq->pri_earliest_deadline = deadline;
1282 		rt_runq->pri_constraint = thread->realtime.constraint;
1283 	} else {
1284 		/* Insert into rt_runq in thread deadline order */
1285 		queue_entry_t iter;
1286 		qe_foreach(iter, queue) {
1287 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
1288 			assert_thread_magic(iter_thread);
1289 
1290 			if (deadline < iter_thread->realtime.deadline) {
1291 				if (iter == queue_first(queue)) {
1292 					preempt = true;
1293 					earliest = true;
1294 					rt_runq->pri_earliest_deadline = deadline;
1295 					rt_runq->pri_constraint = thread->realtime.constraint;
1296 				}
1297 				insque(&thread->runq_links, queue_prev(iter));
1298 				break;
1299 			} else if (iter == queue_last(queue)) {
1300 				enqueue_tail(queue, &thread->runq_links);
1301 				break;
1302 			}
1303 		}
1304 	}
1305 	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
1306 		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
1307 		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
1308 		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
1309 	}
1310 
1311 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1312 	rt_runq->pri_count++;
1313 	os_atomic_inc(&rt_run_queue->count, relaxed);
1314 
1315 	thread_set_runq_locked(thread, processor);
1316 
1317 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1318 
1319 	return preempt;
1320 }
1321 
1322 uint64_t
1323 rt_runq_earliest_deadline(processor_set_t pset)
1324 {
1325 	return os_atomic_load_wide(&pset->rt_runq.earliest_deadline, relaxed);
1326 }
1327 
1328 /*
1329  *	rt_runq_insert:
1330  *
1331  *	Enqueue a thread for realtime execution.
1332  */
1333 bool
1334 rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread)
1335 {
1336 	pset_assert_locked(pset);
1337 
1338 	bool preempt = rt_runq_enqueue(&pset->rt_runq, thread, processor);
1339 	pset_update_rt_stealable_state(pset);
1340 
1341 	return preempt;
1342 }
1343 
1344 int
1345 rt_runq_count(processor_set_t pset)
1346 {
1347 	return os_atomic_load(&pset->rt_runq.count, relaxed);
1348 }
1349 
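/* Highest priority with queued realtime threads, or -1 if the run queue is empty. */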
1350 int
1351 rt_runq_priority(processor_set_t pset)
1352 {
1353 	pset_assert_locked(pset);
1354 	rt_queue_t rt_run_queue = &pset->rt_runq;
1355 
1356 	bitmap_t *map = rt_run_queue->bitmap;
1357 	int i = bitmap_first(map, NRTQS);
1358 	assert(i < NRTQS);
1359 
1360 	if (i >= 0) {
1361 		return i + BASEPRI_RTQUEUES;
1362 	}
1363 
1364 	return i;
1365 }
1366 
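/* True when the earliest-deadline thread's constraint is within rt_constraint_threshold (4 ms by default). */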
1367 bool
1368 rt_runq_is_low_latency(processor_set_t pset)
1369 {
1370 	return os_atomic_load(&pset->rt_runq.constraint, relaxed) <= rt_constraint_threshold;
1371 }
1372 
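/*
 * Dequeue the next realtime thread and refresh the cached earliest deadline,
 * constraint and ed_index. The head of the highest-priority queue is chosen,
 * except that, unless sched_rt_runq_strict_priority is set, the earliest-deadline
 * thread at a lower priority is preferred when both computations (plus the
 * deadline epsilon) still fit within the higher-priority head's constraint.
 */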
1373 thread_t
1374 rt_runq_dequeue(rt_queue_t rt_run_queue)
1375 {
1376 	bitmap_t *map = rt_run_queue->bitmap;
1377 	int i = bitmap_first(map, NRTQS);
1378 	assert((i >= 0) && (i < NRTQS));
1379 
1380 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1381 
1382 	if (!sched_rt_runq_strict_priority) {
1383 		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
1384 		if (ed_index != i) {
1385 			assert((ed_index >= 0) && (ed_index < NRTQS));
1386 			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
1387 
1388 			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
1389 			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1390 
1391 			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
1392 				/* choose the earliest deadline thread */
1393 				rt_runq = ed_runq;
1394 				i = ed_index;
1395 			}
1396 		}
1397 	}
1398 
1399 	assert(rt_runq->pri_count > 0);
1400 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1401 	uint32_t constraint = RT_CONSTRAINT_NONE;
1402 	int ed_index = NOPRI;
1403 	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
1404 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1405 	if (--rt_runq->pri_count > 0) {
1406 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1407 		assert(next_rt != THREAD_NULL);
1408 		earliest_deadline = next_rt->realtime.deadline;
1409 		constraint = next_rt->realtime.constraint;
1410 		ed_index = i;
1411 	} else {
1412 		bitmap_clear(map, i);
1413 	}
1414 	rt_runq->pri_earliest_deadline = earliest_deadline;
1415 	rt_runq->pri_constraint = constraint;
1416 
1417 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1418 		rt_runq = &rt_run_queue->rt_queue_pri[i];
1419 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1420 			earliest_deadline = rt_runq->pri_earliest_deadline;
1421 			constraint = rt_runq->pri_constraint;
1422 			ed_index = i;
1423 		}
1424 	}
1425 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1426 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1427 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1428 	os_atomic_dec(&rt_run_queue->count, relaxed);
1429 
1430 	thread_clear_runq(new_thread);
1431 
1432 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1433 
1434 	return new_thread;
1435 }
1436 
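/*
 * Peek at the head of the highest-priority realtime queue (its earliest-deadline
 * thread) without dequeuing; returns THREAD_NULL if the run queue is empty.
 */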
1437 thread_t
1438 rt_runq_first(rt_queue_t rt_run_queue)
1439 {
1440 	bitmap_t *map = rt_run_queue->bitmap;
1441 	int i = bitmap_first(map, NRTQS);
1442 	if (i < 0) {
1443 		return THREAD_NULL;
1444 	}
1445 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1446 	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1447 
1448 	return next_rt;
1449 }
1450 
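/*
 * Remove a specific thread from its priority queue and recompute the cached
 * earliest deadline, constraint and ed_index, mirroring rt_runq_dequeue().
 */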
1451 void
1452 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
1453 {
1454 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
1455 
1456 	int pri = thread->sched_pri;
1457 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
1458 	int i = pri - BASEPRI_RTQUEUES;
1459 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
1460 	bitmap_t *map = rt_run_queue->bitmap;
1461 
1462 	assert(rt_runq->pri_count > 0);
1463 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
1464 	uint32_t constraint = RT_CONSTRAINT_NONE;
1465 	int ed_index = NOPRI;
1466 	remqueue(&thread->runq_links);
1467 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
1468 	if (--rt_runq->pri_count > 0) {
1469 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
1470 		earliest_deadline = next_rt->realtime.deadline;
1471 		constraint = next_rt->realtime.constraint;
1472 		ed_index = i;
1473 	} else {
1474 		bitmap_clear(map, i);
1475 	}
1476 	rt_runq->pri_earliest_deadline = earliest_deadline;
1477 	rt_runq->pri_constraint = constraint;
1478 
1479 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
1480 		rt_runq = &rt_run_queue->rt_queue_pri[i];
1481 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
1482 			earliest_deadline = rt_runq->pri_earliest_deadline;
1483 			constraint = rt_runq->pri_constraint;
1484 			ed_index = i;
1485 		}
1486 	}
1487 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
1488 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
1489 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
1490 	os_atomic_dec(&rt_run_queue->count, relaxed);
1491 
1492 	thread_clear_runq_locked(thread);
1493 
1494 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
1495 }
1496