/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	sched_prim.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Scheduling primitives
 *
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef KDBG_MACOS_RELEASE
#define KTRC KDBG_MACOS_RELEASE
#else
#define KTRC KDBG_RELEASE
#endif

struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;

static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	uint64_t sum;
	return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
}
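/*
 * deadline_add() saturates rather than wrapping: for example,
 * deadline_add(UINT64_MAX - 1, 10) yields UINT64_MAX, so a deadline
 * computed near the end of the absolute-time range can never wrap
 * around to a small (and therefore spuriously "earliest") value.
 */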
int
rt_runq_count(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
}

uint64_t
rt_runq_earliest_deadline(processor_set_t pset)
{
	return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
}

static int
rt_runq_priority(processor_set_t pset)
{
	pset_assert_locked(pset);

	rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert(i < NRTQS);
	if (i >= 0) {
		return i + BASEPRI_RTQUEUES;
	}
	return i;
}

static thread_t rt_runq_first(rt_queue_t rt_runq);

#if DEBUG
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
#endif

uint32_t rt_constraint_threshold;

static bool
rt_runq_is_low_latency(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
}

TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);

/* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
static uint64_t nonurgent_preemption_timer_abs = 0;

#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

#if XNU_TARGET_OS_XR
#define MAX_UNSAFE_RT_QUANTA 1
#define SAFE_RT_MULTIPLIER 5
#else
#define MAX_UNSAFE_RT_QUANTA 100
#define SAFE_RT_MULTIPLIER 2
#endif /* XNU_TARGET_OS_XR */

#define MAX_UNSAFE_FIXED_QUANTA 100
#define SAFE_FIXED_MULTIPLIER 2

TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_FIXED_MULTIPLIER);

#define MAX_POLL_QUANTA 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

uint64_t max_poll_computation;

uint64_t max_unsafe_rt_computation;
uint64_t max_unsafe_fixed_computation;
uint64_t sched_safe_rt_duration;
uint64_t sched_safe_fixed_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

uint32_t std_quantum;
uint32_t min_std_quantum;
uint32_t bg_quantum;

uint32_t std_quantum_us;
uint32_t bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t thread_depress_time;
uint32_t default_timeshare_computation;
uint32_t default_timeshare_constraint;

uint32_t max_rt_quantum;
uint32_t min_rt_quantum;

uint32_t rt_deadline_epsilon;

uint32_t rt_constraint_threshold;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

unsigned sched_tick;
uint32_t sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t sched_load_compute_interval_us = 15000;
uint64_t sched_load_compute_interval_abs;
static _Atomic uint64_t sched_load_compute_deadline;

uint32_t sched_pri_shifts[TH_BUCKET_MAX];
uint32_t sched_fixed_shift;

uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <= 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines <= 5ms */

uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);

/* interrupts disabled lock to guard core online, recommendation, pcs state */
decl_simple_lock_data(, sched_available_cores_lock);

/*
 * Locked by sched_available_cores_lock.
 * cluster_powerdown_lock is held while making changes to CPU offline state.
 */
static struct global_powered_cores_state {
	/*
	 * Set when PCS has seen all cores boot up and is ready to manage online
	 * state. CPU recommendation works before this point.
	 */
	bool pcs_init_completed;

	cpumap_t pcs_managed_cores; /* all cores managed by the PCS */

	/*
	 * Inputs for CPU offline state provided by clients
	 */
	cpumap_t pcs_requested_online_user; /* updated by processor_start/exit from userspace */
	cpumap_t pcs_requested_online_clpc_user;
	cpumap_t pcs_requested_online_clpc_system;
	cpumap_t pcs_required_online_pmgr; /* e.g. ANE needs these powered for their rail to be happy */
	cpumap_t pcs_required_online_system; /* e.g. smt1 for interrupts, boot processor unless boot arg is set; makes them disable instead of sleep */

	/*
	 * When a suspend count is held, all CPUs must be powered up.
	 */
	int32_t pcs_powerdown_suspend_count;

	/*
	 * Disable automatic cluster powerdown in favor of explicit user core online control
	 */
	bool pcs_user_online_core_control;

	bool pcs_wants_kernel_sleep;
	bool pcs_in_kernel_sleep;

	struct powered_cores_state {
		/*
		 * The input into the recommendation computation from update powered cores.
		 */
		cpumap_t pcs_powerdown_recommended_cores;

		/*
		 * These cores are online and are not powered down.
		 *
		 * Processors with processor->processor_online bit set.
		 */
		cpumap_t pcs_online_cores;

		/*
		 * These cores are disabled or powered down
		 * due to temporary reasons and will come back under presented load,
		 * so the user should still see them as active in the cpu count.
		 *
		 * Processors with processor->shutdown_temporary bit set.
		 */
		cpumap_t pcs_tempdown_cores;
	} pcs_effective;

	/* The 'goal state' PCS has computed and is attempting to apply */
	struct powered_cores_state pcs_requested;

	/*
	 * Inputs into CPU recommended cores provided by clients.
	 * Note that these may be changed under the available cores lock and
	 * become effective while sched_update_powered_cores_drops_lock is in
	 * the middle of making changes to CPU online state.
	 */
	cpumap_t pcs_requested_recommended_clpc;
	cpumap_t pcs_requested_recommended_clpc_system;
	cpumap_t pcs_requested_recommended_clpc_user;
	bool pcs_recommended_clpc_failsafe_active;
	bool pcs_sleep_override_recommended;

	/*
	 * These cores are recommended and can be used for execution
	 * of non-bound threads.
	 *
	 * Processors with processor->is_recommended bit set.
	 */
	cpumap_t pcs_recommended_cores;

	/*
	 * These are for the debugger.
	 * Use volatile to stop the compiler from optimizing out the stores.
	 */
	volatile processor_reason_t pcs_in_flight_reason;
	volatile processor_reason_t pcs_previous_reason;
} pcs = {
	/*
	 * Powerdown is suspended during boot until after all CPUs finish booting,
	 * released by sched_cpu_init_completed.
	 */
	.pcs_powerdown_suspend_count = 1,
	.pcs_requested_online_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_user = ALL_CORES_POWERED,
	.pcs_requested_online_clpc_system = ALL_CORES_POWERED,
	.pcs_in_flight_reason = REASON_NONE,
	.pcs_previous_reason = REASON_NONE,
	.pcs_requested.pcs_powerdown_recommended_cores = ALL_CORES_POWERED,
	.pcs_requested_recommended_clpc = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_system = ALL_CORES_RECOMMENDED,
	.pcs_requested_recommended_clpc_user = ALL_CORES_RECOMMENDED,
};

uint64_t sysctl_sched_recommended_cores = ALL_CORES_RECOMMENDED;

static int sched_last_resort_cpu(void);
static void sched_update_recommended_cores_locked(processor_reason_t reason,
    cpumap_t core_going_offline);
static void sched_update_powered_cores_drops_lock(processor_reason_t requested_reason, spl_t s);

#if __arm64__
static void sched_recommended_cores_maintenance(void);
uint64_t perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm64__ */

uint64_t sched_one_second_interval;
boolean_t allow_direct_handoff = TRUE;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t processor_idle(
	thread_t    thread,
	processor_t processor);

static ast_t csw_check_locked(
	thread_t        thread,
	processor_t     processor,
	processor_set_t pset,
	ast_t           check_reason);

static void processor_setrun(
	processor_t processor,
	thread_t    thread,
	integer_t   options);

static void sched_realtime_timebase_init(void);

static void sched_timer_deadline_tracking_init(void);

#if DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if (debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t thread_bind_internal(
	thread_t    thread,
	processor_t processor);

static void sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t sched_load_shifts[NRQS];
bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

#define cpumap_foreach(cpu_id, cpumap) \
	for (int cpu_id = lsb_first(cpumap); \
	    (cpu_id) >= 0; \
	    cpu_id = lsb_next((cpumap), cpu_id))

#define foreach_node(node) \
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list)

#define foreach_pset_id(pset_id, node) \
	for (int pset_id = lsb_first((node)->pset_map); \
	    pset_id >= 0; \
	    pset_id = lsb_next((node)->pset_map, pset_id))
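/*
 * Usage sketch for the iteration macros above (illustrative only; the loop
 * body is hypothetical, and the pset_array[] lookup is assumed from the
 * surrounding scheduler code). Each macro expands to a plain for-loop that
 * walks set bits from least- to most-significant, so `break' and `continue'
 * behave as in any for-loop:
 *
 *	foreach_node(node) {
 *		foreach_pset_id(pset_id, node) {
 *			processor_set_t pset = pset_array[pset_id];
 *			cpumap_foreach(cpu_id, pset->cpu_available_map) {
 *				kprintf("pset %d cpu %d\n", pset_id, cpu_id);
 *			}
 *		}
 *	}
 */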
/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 * to export to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
int enable_task_set_cluster_type = 0;
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */

void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;

	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */
}

void
sched_timebase_init(void)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

void
sched_timeshare_init(void)
{
	/*
	 * Calculate the timeslicing quantum in us.
	 */
	if (default_preemption_rate < 1) {
		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	}
	std_quantum_us = (1000 * 1000) / default_preemption_rate;

	printf("standard timeslicing quantum is %d us\n", std_quantum_us);

	if (default_bg_preemption_rate < 1) {
		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	}
	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;

	printf("standard background quantum is %d us\n", bg_quantum_us);

	load_shift_init();
	preempt_pri_init();
	sched_tick = 0;
}

void
sched_set_max_unsafe_rt_quanta(int max)
{
	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);

	max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;

	const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
	sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;

#if DEVELOPMENT || DEBUG
	max_unsafe_rt_quanta = max;
#else
	/*
	 * On RELEASE kernels, this is only called on boot where
	 * max is already equal to max_unsafe_rt_quanta.
	 */
	assert3s(max, ==, max_unsafe_rt_quanta);
#endif
}
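/*
 * Worked example (defaults, non-XR, assuming initial_quantum_size() returns
 * the standard quantum for THREAD_NULL): default_preemption_rate == 100
 * makes the standard quantum 10ms, so max_unsafe_rt_quanta == 100 allows
 * ~1s of uninterrupted RT computation before the failsafe trips, and
 * safe_rt_multiplier == 2 sets sched_safe_rt_duration to ~2s.
 */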
void
sched_set_max_unsafe_fixed_quanta(int max)
{
	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);

	max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;

	const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
	sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;

#if DEVELOPMENT || DEBUG
	max_unsafe_fixed_quanta = max;
#else
	/*
	 * On RELEASE kernels, this is only called on boot where
	 * max is already equal to max_unsafe_fixed_quanta.
	 */
	assert3s(max, ==, max_unsafe_fixed_quanta);
#endif
}

void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

void
pset_rt_init(processor_set_t pset)
{
	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
		queue_init(&rqi->pri_queue);
		rqi->pri_count = 0;
		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
		rqi->pri_constraint = RT_CONSTRAINT_NONE;
	}
	os_atomic_init(&pset->rt_runq.count, 0);
	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
}

/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}

void
sched_set_rt_deadline_epsilon(int new_epsilon_us)
{
	rt_deadline_epsilon_us = new_epsilon_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
	rt_deadline_epsilon = (uint32_t)abstime;
}
static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}

void
sched_check_spill(processor_set_t pset, thread_t thread)
{
	(void)pset;
	(void)thread;

	return;
}

bool
sched_thread_should_yield(processor_t processor, thread_t thread)
{
	(void)thread;

	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
}

/* Default implementations of .steal_thread_enabled */
bool
sched_steal_thread_DISABLED(processor_set_t pset)
{
	(void)pset;
	return false;
}

bool
sched_steal_thread_enabled(processor_set_t pset)
{
	return bit_count(pset->node->pset_map) > 1;
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Set up values for timeshare loading factors.
 */
static void
load_shift_init(void)
{
	int8_t k, *p = sched_load_shifts;
	uint32_t i, j;

	uint32_t sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values.
	 */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
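/*
 * Worked example (sched_decay_penalty == 1, the default): the loop above
 * fills sched_load_shifts as
 *
 *	load 0     -> INT8_MIN (no penalty)
 *	load 1     -> 0
 *	loads 2-3  -> 1
 *	loads 4-7  -> 2
 *	loads 8-15 -> 3, and so on,
 *
 * i.e. roughly log2(load), so the priority penalty per quantum of CPU
 * usage doubles each time the runnable load doubles.
 */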
static void
preempt_pri_init(void)
{
	bitmap_t *p = sched_preempt_pri;

	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
		bitmap_set(p, i);
	}

	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
		bitmap_set(p, i);
	}
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

void
check_monotonic_time(uint64_t ctime)
{
	processor_t processor = current_processor();
	uint64_t last_dispatch = processor->last_dispatch;

	if (last_dispatch > ctime) {
		panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
		    last_dispatch, ctime);
	}
}

/*
 * Thread wait timer expiration.
 * Runs in timer interrupt context with interrupts disabled.
 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	if (thread->wait_timer_armed) {
		thread->wait_timer_armed = false;
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	thread->wait_timer_active--;

	thread_unlock(thread);
}

/*
 * thread_unblock:
 *
 * Unblock thread on wake up.
 *
 * Returns TRUE if the thread should now be placed on the runqueue.
 *
 * Thread must be locked.
 *
 * Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t      thread,
	wait_result_t wresult)
{
	boolean_t ready_for_runq = FALSE;
	thread_t  cthread = current_thread();
	uint32_t  new_run_count;
	int       old_thread_state;

	/*
	 * Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 * Cancel pending wait timer.
	 */
	if (thread->wait_timer_armed) {
		if (timer_call_cancel(thread->wait_timer)) {
			thread->wait_timer_active--;
		}
		thread->wait_timer_armed = false;
	}

	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 * Update scheduling state: not waiting, set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);

	if ((old_thread_state & TH_RUN) == 0) {
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /* CONFIG_SCHED_AUTO_JOIN */
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here.
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/*
	 * Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future;
	 * they're present for verification at the moment.
	 */
	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}

/*
 * Routine: thread_allowed_for_handoff
 * Purpose:
 *	Check if the thread is allowed for handoff operation
 * Conditions:
 *	thread lock held, IPC locks may be held.
 * TODO: In future, do not allow handoff if threads have different cluster
 * recommendations.
 */
boolean_t
thread_allowed_for_handoff(
	thread_t thread)
{
	thread_t self = current_thread();

	if (allow_direct_handoff &&
	    thread->sched_mode == TH_MODE_REALTIME &&
	    self->sched_mode == TH_MODE_REALTIME) {
		return TRUE;
	}

	return FALSE;
}

/*
 * Routine: thread_go
 * Purpose:
 *	Unblock and dispatch thread.
 * Conditions:
 *	thread lock held, IPC locks may be held.
 *	thread must have been waiting
 */
void
thread_go(
	thread_t      thread,
	wait_result_t wresult,
	bool          try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_is_null(thread->waitq));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);
	if (thread->started) {
		assert(thread->state & TH_WAKING);
	}

	thread_lock_assert(thread, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);

	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL, NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
			/*
			 * A TH_RUN'ed thread must have a chosen_processor.
			 * thread_setrun would have set it, so we need to
			 * replicate that here.
			 */
			thread->chosen_processor = current_processor();
		} else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}
}

/*
 * Routine: thread_mark_wait_locked
 * Purpose:
 *	Mark a thread as waiting. If, given the circumstances,
 *	it doesn't want to wait (i.e. already aborted), then
 *	indicate that in the return value.
 * Conditions:
 *	at splsched() and thread is locked.
 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t         thread,
	wait_interrupt_t interruptible_orig)
{
	boolean_t        at_safe_point;
	wait_interrupt_t interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 * The thread may have certain types of interrupts/aborts masked
	 * off. Even if the wait location says these types of interrupts
	 * are OK, we have to honor mask settings (outer-scoped code may
	 * not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
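/*
 * Example of the clamping above: if outer-scoped code has restricted the
 * thread to THREAD_UNINT via thread_interrupt_level(), a wait asserted
 * with THREAD_ABORTSAFE is silently downgraded to THREAD_UNINT, because
 * the effective interruptible level can never exceed the thread's
 * TH_OPT_INTMASK setting.
 */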
/*
 * Routine: thread_interrupt_level
 * Purpose:
 *	Set the maximum interruptible state for the
 *	current thread. The effective value of any
 *	interruptible flag passed into assert_wait
 *	will never exceed this.
 *
 *	Useful for code that must not be interrupted,
 *	but which calls code that doesn't know that.
 * Returns:
 *	The old interrupt level for the thread.
 */
__private_extern__
wait_interrupt_t
thread_interrupt_level(
	wait_interrupt_t new_level)
{
	thread_t thread = current_thread();
	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;

	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);

	return result;
}

/*
 * assert_wait:
 *
 * Assert that the current thread is about to go to
 * sleep until the specified event occurs.
 */
wait_result_t
assert_wait(
	event_t          event,
	wait_interrupt_t interruptible)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);

	struct waitq *waitq;
	waitq = global_eventq(event);
	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
}

/*
 * assert_wait_queue:
 *
 * Return the global waitq for the specified event
 */
struct waitq *
assert_wait_queue(
	event_t event)
{
	return global_eventq(event);
}

wait_result_t
assert_wait_timeout(
	event_t          event,
	wait_interrupt_t interruptible,
	uint32_t         interval,
	uint32_t         scale_factor)
{
	thread_t      thread = current_thread();
	wait_result_t wresult;
	uint64_t      deadline;
	spl_t         s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	clock_interval_to_deadline(interval, scale_factor, &deadline);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL,
	    deadline, TIMEOUT_NO_LEEWAY,
	    thread);

	waitq_unlock(waitq);
	splx(s);

	return wresult;
}

wait_result_t
assert_wait_timeout_with_leeway(
	event_t                event,
	wait_interrupt_t       interruptible,
	wait_timeout_urgency_t urgency,
	uint32_t               interval,
	uint32_t               leeway,
	uint32_t               scale_factor)
{
	thread_t      thread = current_thread();
	wait_result_t wresult;
	uint64_t      deadline;
	uint64_t      abstime;
	uint64_t      slop;
	uint64_t      now;
	spl_t         s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	deadline = now + abstime;

	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
	    thread);

	waitq_unlock(waitq);
	splx(s);

	return wresult;
}

wait_result_t
assert_wait_deadline(
	event_t          event,
	wait_interrupt_t interruptible,
	uint64_t         deadline)
{
	thread_t      thread = current_thread();
	wait_result_t wresult;
	spl_t         s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	    TIMEOUT_NO_LEEWAY, thread);

	waitq_unlock(waitq);
	splx(s);

	return wresult;
}

wait_result_t
assert_wait_deadline_with_leeway(
	event_t                event,
	wait_interrupt_t       interruptible,
	wait_timeout_urgency_t urgency,
	uint64_t               deadline,
	uint64_t               leeway)
{
	thread_t      thread = current_thread();
	wait_result_t wresult;
	spl_t         s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, leeway,
	    thread);

	waitq_unlock(waitq);
	splx(s);

	return wresult;
}

void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	os_atomic_init(cond, SCHED_COND_INIT);
}

wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t    interruptible,
	thread_continue_t   continuation,
	void                *parameter)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	return thread_block_parameter(continuation, parameter);
}

wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t    interruptible,
	thread_continue_t   continuation)
{
	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
}

sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}

kern_return_t
sched_cond_signal(
	sched_cond_atomic_t *cond,
	thread_t            thread)
{
	disable_preemption();
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
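/*
 * Typical sched_cond usage, as a sketch (the daemon names are hypothetical;
 * the API calls are the ones defined above). A dedicated kernel thread
 * parks on the condition and re-arms it each time around:
 *
 *	static sched_cond_atomic_t my_cond;   // sched_cond_init(&my_cond) at setup
 *
 *	static void
 *	my_daemon_continue(void *param, wait_result_t wr)
 *	{
 *		sched_cond_ack(&my_cond);     // consume the wakeup, mark active
 *		// ... do work ...
 *		sched_cond_wait(&my_cond, THREAD_UNINT, my_daemon_continue);
 *		// not reached: thread_block_parameter resumes at the continuation
 *	}
 *
 * A waker then only pays for a wakeup when the daemon is actually parked:
 *
 *	sched_cond_signal(&my_cond, my_daemon_thread);
 *
 * because sched_cond_signal() calls thread_wakeup_thread() only when both
 * SCHED_COND_ACTIVE and SCHED_COND_WAKEUP were clear.
 */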
/*
 * thread_isoncpu:
 *
 * Return TRUE if a thread is running on a processor such that an AST
 * is needed to pull it out of userspace execution, or if executing in
 * the kernel, bring to a context switch boundary that would cause
 * thread state to be serialized in the thread PCB.
 *
 * Thread locked, returns the same way. While locked, fields
 * like "state" cannot change. "runq" can change only from set to unset.
 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	/* TODO: This can also be incorrect for `handoff` cases where
	 * the thread is never enqueued on the runq */
	if (thread_get_runq(thread) != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet.
	 * It could be on the stack alloc queue or preparing to be invoked.
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}

/*
 * thread_stop:
 *
 * Force a preemption point for a thread and wait
 * for it to stop running on a CPU. If a stronger
 * guarantee is requested, wait until no longer
 * runnable. Arbitrates access among
 * multiple stop requests. (released by unstop)
 *
 * The thread must enter a wait state and stop via a
 * separate means.
 *
 * Returns FALSE if interrupted.
 */
boolean_t
thread_stop(
	thread_t  thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	spl_t         s = splsched();
	boolean_t     oncpu;

	wake_lock(thread);
	thread_lock(thread);

	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread->state |= TH_SUSP;

	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			/*
			 * TODO: chosen_processor isn't really the right
			 * thing to IPI here. We really want `last_processor`,
			 * but we also want to know where to send the IPI
			 * *before* thread_invoke sets last_processor.
			 *
			 * rdar://47149497 (thread_stop doesn't IPI the right core)
			 */
			assert(thread->state & TH_RUN);
			processor_t processor = thread->chosen_processor;
			assert(processor != PROCESSOR_NULL);
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */
	return TRUE;
}

/*
 * thread_unstop:
 *
 * Release a previous stop request and set
 * the thread running if appropriate.
 *
 * Use only after a successful stop operation.
 */
void
thread_unstop(
	thread_t thread)
{
	spl_t s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);

			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}

/*
 * thread_wait:
 *
 * Wait for a thread to stop running. (non-interruptible)
 *
 */
void
thread_wait(
	thread_t  thread,
	boolean_t until_not_runnable)
{
	wait_result_t wresult;
	boolean_t     oncpu;
	processor_t   processor;
	spl_t         s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU. If stronger requirement
	 * desired, wait until not runnable. Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}

/*
 * Routine: clear_wait_internal
 *
 *	Clear the wait condition for the specified thread.
 *	Start the thread executing if that is appropriate.
 * Arguments:
 *	thread		thread to awaken
 *	result		Wakeup result the thread should see
 * Conditions:
 *	At splsched
 *	the thread is locked.
 * Returns:
 *	KERN_SUCCESS		thread was rousted out a wait
 *	KERN_FAILURE		thread was waiting but could not be rousted
 *	KERN_NOT_WAITING	thread was not waiting
 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t      thread,
	wait_result_t wresult)
{
	waitq_t waitq = thread->waitq;

	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/*
	 * Check that the thread is waiting and not waking, as a waking thread
	 * has already cleared its waitq and is destined to be go'ed; we don't
	 * need to do it again.
	 */
	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
		assert(waitq_is_null(thread->waitq));
		return KERN_NOT_WAITING;
	}

	/* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	thread_go(thread, wresult, /* handoff */ false);

	return KERN_SUCCESS;
}

/*
 * clear_wait:
 *
 * Clear the wait condition for the specified thread. Start the thread
 * executing if that is appropriate.
 *
 * parameters:
 *	thread		thread to awaken
 *	result		Wakeup result the thread should see
 */
kern_return_t
clear_wait(
	thread_t      thread,
	wait_result_t result)
{
	kern_return_t ret;
	spl_t         s;

	s = splsched();
	thread_lock(thread);

	ret = clear_wait_internal(thread, result);

	if (thread == current_thread()) {
		/*
		 * The thread must be ready to wait again immediately
		 * after clearing its own wait.
		 */
		assert((thread->state & TH_WAKING) == 0);
	}

	thread_unlock(thread);
	splx(s);

	return ret;
}

/*
 * thread_wakeup_prim:
 *
 * Common routine for thread_wakeup, thread_wakeup_with_result,
 * and thread_wakeup_one.
 *
 */
kern_return_t
thread_wakeup_prim(
	event_t       event,
	boolean_t     one_thread,
	wait_result_t result)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	if (one_thread) {
		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	} else {
		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	}
}

/*
 * Wakeup a specified thread if and only if it's waiting for this event
 */
kern_return_t
thread_wakeup_thread(
	event_t  event,
	thread_t thread)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	if (__improbable(thread == THREAD_NULL)) {
		panic("%s() called with THREAD_NULL", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
}

/*
 * Wakeup a thread waiting on an event and promote it to a priority.
 *
 * Requires woken thread to un-promote itself when done.
 */
kern_return_t
thread_wakeup_one_with_pri(
	event_t event,
	int     priority)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 * Wakeup a thread waiting on an event,
 * promote it to a priority,
 * and return a reference to the woken thread.
 *
 * Requires woken thread to un-promote itself when done.
 */
thread_t
thread_wakeup_identify(event_t event, int priority)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 * thread_bind:
 *
 * Force the current thread to execute on the specified processor.
 * Takes effect after the next thread_block().
 *
 * Returns the previous binding. PROCESSOR_NULL means
 * not bound.
 *
 * XXX - DO NOT export this to users - XXX
 */
processor_t
thread_bind(
	processor_t processor)
{
	thread_t    self = current_thread();
	processor_t prev;
	spl_t       s;

	s = splsched();
	thread_lock(self);

	prev = thread_bind_internal(self, processor);

	thread_unlock(self);
	splx(s);

	return prev;
}

void
thread_bind_during_wakeup(thread_t thread, processor_t processor)
{
	assert(!ml_get_interrupts_enabled());
	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
#if MACH_ASSERT
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
#endif

	if (thread->bound_processor != processor) {
		thread_bind_internal(thread, processor);
	}
}

void
thread_unbind_after_queue_shutdown(
	thread_t    thread,
	processor_t processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		assert(thread->bound_processor == processor);

		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);

		if (removed) {
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}

/*
 * thread_bind_internal:
 *
 * If the specified thread is not the current thread, and it is currently
 * running on another CPU, a remote AST must be sent to that CPU to cause
 * the thread to migrate to its bound processor. Otherwise, the migration
 * will occur at the next quantum expiration or blocking point.
 *
 * When the thread is the current thread, an explicit thread_block() should
 * be used to force the current processor to context switch away and
 * let the thread migrate to the bound processor.
 *
 * Thread must be locked, and at splsched.
 */
static processor_t
thread_bind_internal(
	thread_t    thread,
	processor_t processor)
{
	processor_t prev;

	/* */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	thread_assert_runq_null(thread);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
	    thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);

	prev = thread->bound_processor;
	thread->bound_processor = processor;

	return prev;
}

/*
 * thread_vm_bind_group_add:
 *
 * The "VM bind group" is a special mechanism to mark a collection
 * of threads from the VM subsystem that, in general, should be scheduled
 * with only one CPU of parallelism.
 * To accomplish this, we initially
 * bind all the threads to the master processor, which has the effect
 * that only one of the threads in the group can execute at once, including
 * preempting threads in the group that are a lower priority. Future
 * mechanisms may use more dynamic mechanisms to prevent the collection
 * of VM threads from using more CPU time than desired.
 *
 * The current implementation can result in priority inversions where
 * compute-bound priority 95 or realtime threads that happen to have
 * landed on the master processor prevent the VM threads from running.
 * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
 * additional CPUs, before restoring the binding (assuming high latency
 * is no longer a problem).
 */

/*
 * The current max is provisioned for:
 * vm_compressor_swap_trigger_thread (92)
 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
 * vm_pageout_continue (92)
 * memorystatus_thread (95)
 */
#define MAX_VM_BIND_GROUP_COUNT (5)
decl_simple_lock_data(static, sched_vm_group_list_lock);
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
static boolean_t sched_vm_group_temporarily_unbound = FALSE;

void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	if (support_bootcpu_shutdown) {
		/*
		 * Bind group is not supported without an always-on
		 * processor to bind to. If we need these to coexist,
		 * we'd need to dynamically move the group to
		 * another processor as it shuts down, or build
		 * a different way to run a set of threads
		 * without parallelism.
		 */
		return;
	}

	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}

static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}

#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

int sched_allow_rt_smt = 1;
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;

int
sched_get_rt_n_backup_processors(void)
{
	return sched_rt_n_backup_processors;
}

void
sched_set_rt_n_backup_processors(int n)
{
	if (n < 0) {
		n = 0;
	} else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
		n = SCHED_MAX_BACKUP_PROCESSORS;
	}

	sched_rt_n_backup_processors = n;
}

int sched_rt_runq_strict_priority = false;

inline static processor_set_t
change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
{
	if (current_pset != new_pset) {
		pset_unlock(current_pset);
		pset_lock(new_pset);
	}

	return new_pset;
}

/*
 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
 * rebalancing opportunity exists when a core is (instantaneously) idle, but
 * other SMT-capable cores may be over-committed.
 *
 * TODO: some possible negatives:
 * - IPI thrash if this core does not remain idle following the load balancing ASTs
 * - Idle "thrash", when IPI issue is followed by idle entry/core power down
 *   followed by a wakeup shortly thereafter.
 */

#if (DEVELOPMENT || DEBUG)
int sched_smt_balance = 1;
#endif

/* Invoked with pset locked, returns with pset unlocked */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/*
	 * Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
*/ if (sib_processor->state != PROCESSOR_IDLE) { goto smt_balance_exit; } processor_t sprocessor; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] & ~cpset->primary_map); for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) { sprocessor = processor_array[cpuid]; if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) && (sprocessor->current_pri < BASEPRI_RTQUEUES)) { ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL); if (ipi_type != SCHED_IPI_NONE) { assert(sprocessor != cprocessor); ast_processor = sprocessor; break; } } } smt_balance_exit: pset_unlock(cpset); if (ast_processor) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0); sched_ipi_perform(ast_processor, ipi_type); } return false; } static cpumap_t pset_available_cpumap(processor_set_t pset) { return pset->cpu_available_map & pset->recommended_bitmask; } int pset_available_cpu_count(processor_set_t pset) { return bit_count(pset_available_cpumap(pset)); } bool pset_is_recommended(processor_set_t pset) { if (!pset) { return false; } return pset_available_cpu_count(pset) > 0; } bool pset_type_is_recommended(processor_set_t pset) { if (!pset) { return false; } pset_map_t recommended_psets = os_atomic_load(&pset->node->pset_recommended_map, relaxed); return bit_count(recommended_psets) > 0; } static cpumap_t pset_available_but_not_running_cpumap(processor_set_t pset) { return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) & pset->recommended_bitmask; } bool pset_has_stealable_threads(processor_set_t pset) { pset_assert_locked(pset); cpumap_t avail_map = pset_available_but_not_running_cpumap(pset); /* * Secondary CPUs never steal, so allow stealing of threads if there are more threads than * available primary CPUs */ avail_map &= pset->primary_map; return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map)); } static cpumap_t pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset) { cpumap_t avail_map = pset_available_cpumap(pset); if (!sched_allow_rt_smt) { /* * Secondary CPUs are not allowed to run RT threads, so * only primary CPUs should be included */ avail_map &= pset->primary_map; } return avail_map & ~pset->realtime_map; } static bool pset_needs_a_followup_IPI(processor_set_t pset) { int nbackup_cpus = 0; if (rt_runq_is_low_latency(pset)) { nbackup_cpus = sched_rt_n_backup_processors; } int rt_rq_count = rt_runq_count(pset); return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0); } bool pset_has_stealable_rt_threads(processor_set_t pset) { pset_node_t node = pset->node; if (bit_count(node->pset_map) == 1) { return false; } cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset); return rt_runq_count(pset) > bit_count(avail_map); } static void pset_update_rt_stealable_state(processor_set_t pset) { if (pset_has_stealable_rt_threads(pset)) { pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset); } else { pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE; } } static void clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number) { /* Acknowledge any pending IPIs here with pset lock held */ 
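/* Clears this CPU's bit in the urgent and preempt pending-AST masks (and, when CONFIG_SCHED_DEFERRED_AST is configured, the deferred mask); the urgent clear is traced as a DBG_FUNC_END event so the pending interval can be matched up in ktrace. */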
pset_assert_locked(pset); if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number); } bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); #if defined(CONFIG_SCHED_DEFERRED_AST) bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id); #endif } /* * Called with pset locked, on a processor that is committing to run a new thread * Will transition an idle or dispatching processor to running as it picks up * the first new thread from the idle thread. */ static void pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread) { pset_assert_locked(pset); if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) { assert(current_thread() == processor->idle_thread); /* * Dispatching processor is now committed to running new_thread, * so change its state to PROCESSOR_RUNNING. */ pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); } else { assert(processor->state == PROCESSOR_RUNNING); } processor_state_update_from_thread(processor, new_thread, true); if (new_thread->sched_pri >= BASEPRI_RTQUEUES) { bit_set(pset->realtime_map, processor->cpu_id); } else { bit_clear(pset->realtime_map, processor->cpu_id); } pset_update_rt_stealable_state(pset); pset_node_t node = pset->node; if (bit_count(node->pset_map) == 1) { /* Node has only a single pset, so skip node pset map updates */ return; } cpumap_t avail_map = pset_available_cpumap(pset); if (new_thread->sched_pri >= BASEPRI_RTQUEUES) { if ((avail_map & pset->realtime_map) == avail_map) { /* No more non-RT CPUs in this pset */ atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed); } avail_map &= pset->primary_map; if ((avail_map & pset->realtime_map) == avail_map) { /* No more non-RT primary CPUs in this pset */ atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed); } } else { if ((avail_map & pset->realtime_map) != avail_map) { if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) { atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed); } } avail_map &= pset->primary_map; if ((avail_map & pset->realtime_map) != avail_map) { if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) { atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed); } } } } static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills); static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus); static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries); #if defined(__x86_64__) static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups); static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups); #endif static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup); static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t 
processor); static bool other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline) { pset_map_t pset_map = stealing_pset->node->pset_map; bit_clear(pset_map, stealing_pset->pset_id); for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) { processor_set_t nset = pset_array[pset_id]; if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) { return true; } } return false; } /* * starting_pset must be locked, but returns true if it is unlocked before return */ static bool choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi, processor_t *result_processor, sched_ipi_type_t *result_ipi_type) { bool starting_pset_is_unlocked = false; uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset); int max_pri = rt_runq_priority(starting_pset); __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq)); processor_set_t pset = starting_pset; processor_t next_rt_processor = PROCESSOR_NULL; if (spill_ipi) { processor_set_t nset = next_pset(pset); assert(nset != starting_pset); pset = change_locked_pset(pset, nset); starting_pset_is_unlocked = true; } do { const bool consider_secondaries = true; next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries); if (next_rt_processor == PROCESSOR_NULL) { if (!spill_ipi) { break; } processor_set_t nset = next_pset(pset); if (nset == starting_pset) { break; } pset = change_locked_pset(pset, nset); starting_pset_is_unlocked = true; } } while (next_rt_processor == PROCESSOR_NULL); if (next_rt_processor) { if (pset != starting_pset) { if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START, next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid); } } *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT); *result_processor = next_rt_processor; } if (pset != starting_pset) { pset_unlock(pset); } return starting_pset_is_unlocked; } /* * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond * followup processor - used in thread_select when there are still threads on the run queue and available processors * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue */ typedef enum { none, backup, followup, spill } next_processor_type_t; #undef LOOP_COUNT #ifdef LOOP_COUNT int max_loop_count[MAX_SCHED_CPUS] = { 0 }; #endif /* * thread_select: * * Select a new thread for the current processor to execute. * * May select the current thread, which must be locked. 
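 *
 * As implemented below, selection proceeds roughly in this order: keep the current thread
 * if it is still the best choice, otherwise take the realtime run queue, then the regular
 * run queue via SCHED(choose_thread), then attempt to steal from other processors, and
 * finally fall back to the idle thread.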
*/ static thread_t thread_select(thread_t thread, processor_t processor, ast_t *reason) { processor_set_t pset = processor->processor_set; thread_t new_thread = THREAD_NULL; assert(processor == current_processor()); assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START, 0, pset->pending_AST_URGENT_cpu_mask, 0, 0); __kdebug_only int idle_reason = 0; __kdebug_only int delay_count = 0; #if defined(__x86_64__) int timeout_count = sched_backup_cpu_timeout_count; if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) { /* Prefer cpu0 as backup */ timeout_count--; } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) { /* Prefer secondary cpu as backup */ timeout_count--; } #endif bool pending_AST_URGENT = false; bool pending_AST_PREEMPT = false; #ifdef LOOP_COUNT int loop_count = -1; #endif do { /* * Update the priority. */ if (SCHED(can_update_priority)(thread)) { SCHED(update_priority)(thread); } pset_lock(pset); restart: #ifdef LOOP_COUNT loop_count++; if (loop_count > max_loop_count[processor->cpu_id]) { max_loop_count[processor->cpu_id] = loop_count; if (bit_count(loop_count) == 1) { kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count); } } #endif pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id); pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); processor_state_update_from_thread(processor, thread, true); idle_reason = 0; processor_t ast_processor = PROCESSOR_NULL; processor_t next_rt_processor = PROCESSOR_NULL; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; assert(processor->state != PROCESSOR_OFF_LINE); /* * Bound threads are dispatched to a processor without going through * choose_processor(), so in those cases we must continue trying to dequeue work * as we are the only option. */ if (!SCHED(processor_bound_count)(processor)) { if (!processor->is_recommended) { /* * The performance controller has provided a hint not to dispatch more threads. */ idle_reason = 1; goto send_followup_ipi_before_idle; } else if (rt_runq_count(pset)) { bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false); /* Give the current RT thread a chance to complete */ ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice); #if defined(__x86_64__) /* * On Intel we want to avoid SMT secondary processors and processor 0 * but allow them to be used as backup processors in case the preferred chosen * processor is delayed by interrupts or processor stalls. So if it is * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false)) * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true)) * we delay up to (timeout_count * 10us) to give the preferred processor a chance * to grab the thread before the (current) backup processor does. * * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use * cpu0 before secondary cpus or not.
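 * (Illustrative: with the default timeout_count of 5, a backup CPU may wait roughly
 * 5 x 10us = 50us in total before giving up and taking the realtime thread itself.)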
*/ if (!ok_to_run_realtime_thread) { if (sched_ok_to_run_realtime_thread(pset, processor, true)) { if (timeout_count-- > 0) { pset_unlock(pset); thread_unlock(thread); delay(10); delay_count++; thread_lock(thread); pset_lock(pset); goto restart; } ok_to_run_realtime_thread = true; } } #endif if (!ok_to_run_realtime_thread) { idle_reason = 2; goto send_followup_ipi_before_idle; } } else if (processor->processor_primary != processor) { /* * Should this secondary SMT processor attempt to find work? For pset runqueue systems, * we should look for work only under the same conditions that choose_processor() * would have assigned work, which is when all primary processors have been assigned work. */ if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) { /* There are idle primaries */ idle_reason = 3; goto idle; } } } /* * Test to see if the current thread should continue * to run on this processor. Must not be attempting to wait, and not * bound to a different processor, nor be in the wrong * processor set, nor be forced to context switch by TH_SUSP. * * Note that there are never any RT threads in the regular runqueue. * * This code is insanely tricky. */ /* i.e. not waiting, not TH_SUSP'ed */ bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN); /* * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads. * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors' * * * A yielding thread shouldn't be forced to context switch. */ bool is_yielding = (*reason & AST_YIELD) == AST_YIELD; bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor; bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset; bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor; bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason); bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true); bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor); if (current_thread_can_keep_running) { /* * This thread is eligible to keep running on this processor. * * RT threads with un-expired quantum stay on processor, * unless there's a valid RT thread with an earlier deadline * and it is still ok_to_run_realtime_thread. */ if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) { /* * Pick a new RT thread only if ok_to_run_realtime_thread * (but the current thread is allowed to complete). */ if (ok_to_run_realtime_thread) { if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { goto pick_new_rt_thread; } if (rt_runq_priority(pset) > thread->sched_pri) { if (sched_rt_runq_strict_priority) { /* The next RT thread is better, so pick it off the runqueue. */ goto pick_new_rt_thread; } /* * See if the current lower priority thread can continue to run without causing * the higher priority thread on the run queue to miss its deadline.
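 * (That is, keep running the current thread only while current->realtime.computation plus
 * hi_thread->realtime.computation plus rt_deadline_epsilon still fits within
 * hi_thread->realtime.constraint, which is the test performed below.)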
*/ thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset)); if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) { /* The next RT thread is better, so pick it off the runqueue. */ goto pick_new_rt_thread; } } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) { /* The next RT thread is better, so pick it off the runqueue. */ goto pick_new_rt_thread; } if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) { goto pick_new_rt_thread; } } /* This is still the best RT thread to run. */ processor->deadline = thread->realtime.deadline; sched_update_pset_load_average(pset, 0); clear_pending_AST_bits(pset, processor, 1); next_rt_processor = PROCESSOR_NULL; next_rt_ipi_type = SCHED_IPI_NONE; bool pset_unlocked = false; __kdebug_only next_processor_type_t nptype = none; if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) { nptype = spill; pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type); } else if (pset_needs_a_followup_IPI(pset)) { nptype = followup; pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type); } if (!pset_unlocked) { pset_unlock(pset); } if (next_rt_processor) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE, next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2); sched_ipi_perform(next_rt_processor, next_rt_ipi_type); } KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1); return thread; } if ((rt_runq_count(pset) == 0) && SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) { /* This thread is still the highest priority runnable (non-idle) thread */ processor->deadline = RT_DEADLINE_NONE; sched_update_pset_load_average(pset, 0); clear_pending_AST_bits(pset, processor, 2); pset_unlock(pset); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2); return thread; } } else { /* * This processor must context switch. * If it's due to a rebalance, we should aggressively find this thread a new home. */ if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) { *reason |= AST_REBALANCE; } } bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) && (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) && (processor->processor_secondary->state == PROCESSOR_IDLE)); /* OK, so we're not going to run the current thread. Look at the RT queue. 
*/ if (ok_to_run_realtime_thread) { pick_new_rt_thread: new_thread = sched_rt_choose_thread(pset); if (new_thread != THREAD_NULL) { processor->deadline = new_thread->realtime.deadline; pset_commit_processor_to_new_thread(pset, processor, new_thread); clear_pending_AST_bits(pset, processor, 3); if (processor->processor_secondary != NULL) { processor_t sprocessor = processor->processor_secondary; if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) { ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL); ast_processor = sprocessor; } } } } send_followup_ipi_before_idle: /* This might not have been cleared if we didn't call sched_rt_choose_thread() */ if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5); } __kdebug_only next_processor_type_t nptype = none; bool pset_unlocked = false; if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) { nptype = spill; pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type); } else if (pset_needs_a_followup_IPI(pset)) { nptype = followup; pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type); } assert(new_thread || !ast_processor); if (new_thread || next_rt_processor) { if (!pset_unlocked) { pset_unlock(pset); pset_unlocked = true; } if (ast_processor == next_rt_processor) { ast_processor = PROCESSOR_NULL; ipi_type = SCHED_IPI_NONE; } if (ast_processor) { sched_ipi_perform(ast_processor, ipi_type); } if (next_rt_processor) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE, next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3); sched_ipi_perform(next_rt_processor, next_rt_ipi_type); } if (new_thread) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3); return new_thread; } } if (pset_unlocked) { pset_lock(pset); } if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { /* Things changed while we dropped the lock */ goto restart; } if (processor->is_recommended) { bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id); if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) { /* Things changed while we dropped the lock */ goto restart; } if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) { /* secondary can only run realtime thread */ if (idle_reason == 0) { idle_reason = 4; } goto idle; } } else if (!SCHED(processor_bound_count)(processor)) { /* processor not recommended and no bound threads */ if (idle_reason == 0) { idle_reason = 5; } goto idle; } processor->deadline = RT_DEADLINE_NONE; /* No RT threads, so let's look at the regular threads. */ if ((new_thread = SCHED(choose_thread)(processor, MINPRI, current_thread_can_keep_running ? 
thread : THREAD_NULL, *reason)) != THREAD_NULL) { if (new_thread != thread) { /* Going to context-switch */ pset_commit_processor_to_new_thread(pset, processor, new_thread); clear_pending_AST_bits(pset, processor, 4); ast_processor = PROCESSOR_NULL; ipi_type = SCHED_IPI_NONE; processor_t sprocessor = processor->processor_secondary; if (sprocessor != NULL) { if (sprocessor->state == PROCESSOR_RUNNING) { if (thread_no_smt(new_thread)) { ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL); ast_processor = sprocessor; } } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) { ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT); ast_processor = sprocessor; } } pset_unlock(pset); if (ast_processor) { sched_ipi_perform(ast_processor, ipi_type); } } else { /* Will continue running the current thread */ clear_pending_AST_bits(pset, processor, 4); pset_unlock(pset); } KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4); return new_thread; } if (processor->must_idle) { processor->must_idle = false; *reason |= AST_REBALANCE; idle_reason = 6; goto idle; } if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) { /* * No runnable threads, attempt to steal * from other processors. Returns with pset lock dropped. */ if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) { pset_lock(pset); pset_commit_processor_to_new_thread(pset, processor, new_thread); if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { /* * A realtime thread chose this processor while it was DISPATCHING * and the pset lock was dropped */ ast_on(AST_URGENT | AST_PREEMPT); } clear_pending_AST_bits(pset, processor, 5); pset_unlock(pset); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5); return new_thread; } /* * If other threads have appeared, shortcut * around again. */ if (SCHED(processor_bound_count)(processor)) { continue; } if (processor->is_recommended) { if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) { continue; } } pset_lock(pset); } idle: /* Someone selected this processor while we had dropped the lock */ if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) || (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) { goto restart; } if ((idle_reason == 0) && current_thread_can_keep_running) { /* This thread is the only runnable (non-idle) thread */ if (thread->sched_pri >= BASEPRI_RTQUEUES) { processor->deadline = thread->realtime.deadline; } else { processor->deadline = RT_DEADLINE_NONE; } sched_update_pset_load_average(pset, 0); clear_pending_AST_bits(pset, processor, 6); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6); pset_unlock(pset); return thread; } /* * Nothing is runnable, or this processor must be forced idle, * so set this processor idle if it was running.
*/ if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) { pset_update_processor_state(pset, processor, PROCESSOR_IDLE); processor_state_update_idle(processor); } pset_update_rt_stealable_state(pset); clear_pending_AST_bits(pset, processor, 7); /* Invoked with pset locked, returns with pset unlocked */ processor->next_idle_short = SCHED(processor_balance)(processor, pset); new_thread = processor->idle_thread; } while (new_thread == THREAD_NULL); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END, (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason); return new_thread; } /* * thread_invoke * * Called at splsched with neither thread locked. * * Perform a context switch and start executing the new thread. * * Returns FALSE when the context switch didn't happen. * The reference to the new thread is still consumed. * * "self" is what is currently running on the processor, * "thread" is the new thread to context switch to * (which may be the same thread in some cases) */ static boolean_t thread_invoke( thread_t self, thread_t thread, ast_t reason) { if (__improbable(get_preemption_level() != 0)) { int pl = get_preemption_level(); panic("thread_invoke: preemption_level %d, possible cause: %s", pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" : "blocking while holding a spinlock, or within interrupt context")); } thread_continue_t continuation = self->continuation; void *parameter = self->parameter; struct recount_snap snap = { 0 }; recount_snapshot(&snap); uint64_t ctime = snap.rsn_time_mach; check_monotonic_time(ctime); #ifdef CONFIG_MACH_APPROXIMATE_TIME commpage_update_mach_approximate_time(ctime); #endif if (ctime < thread->last_made_runnable_time) { panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx", ctime, thread->last_made_runnable_time); } #if defined(CONFIG_SCHED_TIMESHARE_CORE) if (!((thread->state & TH_IDLE) != 0 || ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) { sched_timeshare_consider_maintenance(ctime, true); } #endif recount_log_switch_thread(&snap); processor_t processor = current_processor(); if (!processor->processor_online) { panic("Invalid attempt to context switch an offline processor"); } assert_thread_magic(self); assert(self == current_thread()); thread_assert_runq_null(self); assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN); thread_lock(thread); assert_thread_magic(thread); assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN); assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor); thread_assert_runq_null(thread); /* Update SFI class based on other factors */ thread->sfi_class = sfi_thread_classify(thread); /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */ thread->same_pri_latency = ctime - thread->last_basepri_change_time; /* * In case a base_pri update happened between the timestamp and * taking the thread lock */ if (ctime <= thread->last_basepri_change_time) { thread->same_pri_latency = ctime - thread->last_made_runnable_time; } /* Allow realtime threads to hang onto a stack. 
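 * (Keeping a reserved stack makes it less likely that a realtime thread will have to wait
 * for a stack allocation the next time it is dispatched.)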
*/ if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) { self->reserved_stack = self->kernel_stack; } /* Prepare for spin debugging */ #if SCHED_HYGIENE_DEBUG ml_spin_debug_clear(thread); #endif if (continuation != NULL) { if (!thread->kernel_stack) { /* * If we are using a privileged stack, * check to see whether we can exchange it with * that of the other thread. */ if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) { goto need_stack; } /* * Context switch by performing a stack handoff. * Requires both threads to be parked in a continuation. */ continuation = thread->continuation; parameter = thread->parameter; processor->active_thread = thread; processor_state_update_from_thread(processor, thread, false); if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) { thread->ps_switch++; } thread->p_switch++; } thread->last_processor = processor; thread->c_switch++; ast_context(thread); thread_unlock(thread); self->reason = reason; processor->last_dispatch = ctime; self->last_run_time = ctime; timer_update(&thread->runnable_timer, ctime); recount_switch_thread(&snap, self, get_threadtask(self)); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) { SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); } DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc()); SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); #if KPERF kperf_off_cpu(self); #endif /* KPERF */ /* * This is where we actually switch thread identity, * and address space if required. However, register * state is not switched - this routine leaves the * stack and register state active on the current CPU. */ TLOG(1, "thread_invoke: calling stack_handoff\n"); stack_handoff(self, thread); /* 'self' is now off core */ assert(thread == current_thread_volatile()); DTRACE_SCHED(on__cpu); #if KPERF kperf_on_cpu(thread, continuation, NULL); #endif /* KPERF */ recount_log_switch_thread_on(&snap); thread_dispatch(self, thread); #if KASAN /* Old thread's stack has been moved to the new thread, so explicitly * unpoison it. 
*/ kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); #endif thread->continuation = thread->parameter = NULL; boolean_t enable_interrupts = TRUE; /* idle thread needs to stay interrupts-disabled */ if ((thread->state & TH_IDLE)) { enable_interrupts = FALSE; } assert(continuation); call_continuation(continuation, parameter, thread->wait_result, enable_interrupts); /*NOTREACHED*/ } else if (thread == self) { /* same thread but with continuation */ ast_context(self); thread_unlock(self); #if KPERF kperf_on_cpu(thread, continuation, NULL); #endif /* KPERF */ recount_log_switch_thread_on(&snap); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); #if KASAN /* stack handoff to self - no thread_dispatch(), so clear the stack * and free the fakestack directly */ #if KASAN_CLASSIC kasan_fakestack_drop(self); kasan_fakestack_gc(self); #endif /* KASAN_CLASSIC */ kasan_unpoison_stack(self->kernel_stack, kernel_stack_size); #endif /* KASAN */ self->continuation = self->parameter = NULL; boolean_t enable_interrupts = TRUE; /* idle thread needs to stay interrupts-disabled */ if ((self->state & TH_IDLE)) { enable_interrupts = FALSE; } call_continuation(continuation, parameter, self->wait_result, enable_interrupts); /*NOTREACHED*/ } } else { /* * Check that the other thread has a stack */ if (!thread->kernel_stack) { need_stack: if (!stack_alloc_try(thread)) { thread_unlock(thread); thread_stack_enqueue(thread); return FALSE; } } else if (thread == self) { ast_context(self); thread_unlock(self); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); return TRUE; } } /* * Context switch by full context save. */ processor->active_thread = thread; processor_state_update_from_thread(processor, thread, false); if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) { thread->ps_switch++; } thread->p_switch++; } thread->last_processor = processor; thread->c_switch++; ast_context(thread); thread_unlock(thread); self->reason = reason; processor->last_dispatch = ctime; self->last_run_time = ctime; timer_update(&thread->runnable_timer, ctime); recount_switch_thread(&snap, self, get_threadtask(self)); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); } DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc()); SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); #if KPERF kperf_off_cpu(self); #endif /* KPERF */ /* * This is where we actually switch register context, * and address space if required. We will next run * as a result of a subsequent context switch. * * Once registers are switched and the processor is running "thread", * the stack variables and non-volatile registers will contain whatever * was there the last time that thread blocked. 
No local variables should * be used after this point, except for the special case of "thread", which * the platform layer returns as the previous thread running on the processor * via the function call ABI as a return register, and "self", which may have * been stored on the stack or a non-volatile register, but a stale idea of * what was on the CPU is newly-accurate because that thread is again * running on the CPU. * * If one of the threads is using a continuation, thread_continue * is used to stitch up its context. * * If we are invoking a thread which is resuming from a continuation, * the CPU will invoke thread_continue next. * * If the current thread is parking in a continuation, then its state * won't be saved and the stack will be discarded. When the stack is * re-allocated, it will be configured to resume from thread_continue. */ assert(continuation == self->continuation); thread = machine_switch_context(self, continuation, thread); assert(self == current_thread_volatile()); TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread); assert(continuation == NULL && self->continuation == NULL); DTRACE_SCHED(on__cpu); #if KPERF kperf_on_cpu(self, NULL, __builtin_frame_address(0)); #endif /* KPERF */ /* Previous snap on the old stack is gone. */ recount_log_switch_thread_on(NULL); /* We have been resumed and are set to run. */ thread_dispatch(thread, self); return TRUE; } #if defined(CONFIG_SCHED_DEFERRED_AST) /* * pset_cancel_deferred_dispatch: * * Cancels all ASTs that we can cancel for the given processor set * if the current processor is running the last runnable thread in the * system. * * This function assumes the current thread is runnable. This must * be called with the pset unlocked. */ static void pset_cancel_deferred_dispatch( processor_set_t pset, processor_t processor) { processor_t active_processor = NULL; uint32_t sampled_sched_run_count; pset_lock(pset); sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed); /* * If we have emptied the run queue, and our current thread is runnable, we * should tell any processors that are still DISPATCHING that they will * probably not have any work to do. In the event that there are no * pending signals that we can cancel, this is also uninteresting. * * In the unlikely event that another thread becomes runnable while we are * doing this (sched_run_count is atomically updated, not guarded), the * codepath making it runnable SHOULD (a dangerous word) need the pset lock * in order to dispatch it to a processor in our pset. So, the other * codepath will wait while we squash all cancelable ASTs, get the pset * lock, and then dispatch the freshly runnable thread. So this should be * correct (we won't accidentally have a runnable thread that hasn't been * dispatched to an idle processor), if not ideal (we may be restarting the * dispatch process, which could have some overhead). */ if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) { uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] & pset->pending_deferred_AST_cpu_mask & ~pset->pending_AST_URGENT_cpu_mask); for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) { active_processor = processor_array[cpuid]; /* * If a processor is DISPATCHING, it could be because of * a cancelable signal. 
* * IF the processor is not our * current processor (the current processor should not * be DISPATCHING, so this is a bit paranoid), AND there * is a cancelable signal pending on the processor, AND * there is no non-cancelable signal pending (as there is * no point trying to backtrack on bringing the processor * up if a signal we cannot cancel is outstanding), THEN * it should make sense to roll back the processor state * to the IDLE state. * * If the racey nature of this approach (as the signal * will be arbitrated by hardware, and can fire as we * roll back state) results in the core responding * despite being pushed back to the IDLE state, it * should be no different than if the core took some * interrupt while IDLE. */ if (active_processor != processor) { /* * Squash all of the processor state back to some * reasonable facsimile of PROCESSOR_IDLE. */ processor_state_update_idle(active_processor); active_processor->deadline = RT_DEADLINE_NONE; pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE); bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id); machine_signal_idle_cancel(active_processor); } } } pset_unlock(pset); } #else /* We don't support deferred ASTs; everything is candycanes and sunshine. */ #endif static void thread_csw_callout( thread_t old, thread_t new, uint64_t timestamp) { perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH; uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency; machine_switch_perfcontrol_context(event, timestamp, 0, same_pri_latency, old, new); } /* * thread_dispatch: * * Handle threads at context switch. Re-dispatch other thread * if still running, otherwise update run state and perform * special actions. Update quantum for other thread and begin * the quantum for ourselves. * * "thread" is the old thread that we have switched away from. * "self" is the new current thread that we have context switched to * * Called at splsched. * */ void thread_dispatch( thread_t thread, thread_t self) { processor_t processor = self->last_processor; bool was_idle = false; bool processor_bootstrap = (thread == THREAD_NULL); assert(processor == current_processor()); assert(self == current_thread_volatile()); assert(thread != self); if (thread != THREAD_NULL) { /* * Do the perfcontrol callout for context switch. * The reason we do this here is: * - thread_dispatch() is called from various places that are not * the direct context switch path for eg. processor shutdown etc. * So adding the callout here covers all those cases. * - We want this callout as early as possible to be close * to the timestamp taken in thread_invoke() * - We want to avoid holding the thread lock while doing the * callout * - We do not want to callout if "thread" is NULL. */ thread_csw_callout(thread, self, processor->last_dispatch); #if KASAN if (thread->continuation != NULL) { /* * Thread has a continuation and the normal stack is going away. * Unpoison the stack and mark all fakestack objects as unused. */ #if KASAN_CLASSIC kasan_fakestack_drop(thread); #endif /* KASAN_CLASSIC */ if (thread->kernel_stack) { kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); } } #if KASAN_CLASSIC /* * Free all unused fakestack objects. */ kasan_fakestack_gc(thread); #endif /* KASAN_CLASSIC */ #endif /* KASAN */ /* * If blocked at a continuation, discard * the stack. 
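 * (The thread will resume on a freshly allocated stack via thread_continue when it next runs.)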
*/ if (thread->continuation != NULL && thread->kernel_stack != 0) { stack_free(thread); } if (thread->state & TH_IDLE) { was_idle = true; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), 0, thread->state, sched_run_buckets[TH_BUCKET_RUN], 0); } else { int64_t consumed; int64_t remainder = 0; if (processor->quantum_end > processor->last_dispatch) { remainder = processor->quantum_end - processor->last_dispatch; } consumed = thread->quantum_remaining - remainder; if ((thread->reason & AST_LEDGER) == 0) { /* * Bill CPU time to both the task and * the individual thread. */ ledger_credit_thread(thread, thread->t_ledger, task_ledgers.cpu_time, consumed); ledger_credit_thread(thread, thread->t_threadledger, thread_ledgers.cpu_time, consumed); if (thread->t_bankledger) { ledger_credit_thread(thread, thread->t_bankledger, bank_ledgers.cpu_time, (consumed - thread->t_deduct_bank_ledger_time)); } thread->t_deduct_bank_ledger_time = 0; if (consumed > 0) { /* * This should never be negative, but in traces we are seeing some instances * of consumed being negative. * thread_dispatch() thread CPU consumed calculation sometimes results in negative value */ sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket); } } /* For the thread that we just context switched away from, figure * out if we have expired the wq quantum and set the AST if we have */ if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) { thread_evaluate_workqueue_quantum_expiry(thread); } if (__improbable(thread->rwlock_count != 0)) { smr_mark_active_trackers_stalled(thread); } /* * Pairs with task_restartable_ranges_synchronize */ wake_lock(thread); thread_lock(thread); /* * Same as ast_check(), in case we missed the IPI */ thread_reset_pcs_ack_IPI(thread); /* * Apply a priority floor if the thread holds a kernel resource * or explicitly requested it. * Do this before checking starting_pri to avoid overpenalizing * repeated rwlock blockers. */ if (__improbable(thread->rwlock_count != 0)) { lck_rw_set_promotion_locked(thread); } if (__improbable(thread->priority_floor_count != 0)) { thread_floor_boost_set_promotion_locked(thread); } boolean_t keep_quantum = processor->first_timeslice; /* * Treat a thread which has dropped priority since it got on core * as having expired its quantum. */ if (processor->starting_pri > thread->sched_pri) { keep_quantum = FALSE; } /* Compute remainder of current quantum. */ if (keep_quantum && processor->quantum_end > processor->last_dispatch) { thread->quantum_remaining = (uint32_t)remainder; } else { thread->quantum_remaining = 0; } if (thread->sched_mode == TH_MODE_REALTIME) { /* * Cancel the deadline if the thread has * consumed the entire quantum. */ if (thread->quantum_remaining == 0) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0); thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED; } } else { #if defined(CONFIG_SCHED_TIMESHARE_CORE) /* * For non-realtime threads treat a tiny * remaining quantum as an expired quantum * but include what's left next time. */ if (thread->quantum_remaining < min_std_quantum) { thread->reason |= AST_QUANTUM; thread->quantum_remaining += SCHED(initial_quantum_size)(thread); } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ } /* * If we are doing a direct handoff then * take the remainder of the quantum. 
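 * (On AST_HANDOFF without AST_QUANTUM, the incoming thread inherits whatever quantum the
 * outgoing thread had left, and the outgoing thread is treated as having used its quantum.)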
*/ if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) { self->quantum_remaining = thread->quantum_remaining; thread->reason |= AST_QUANTUM; thread->quantum_remaining = 0; } thread->computation_metered += (processor->last_dispatch - thread->computation_epoch); if (!(thread->state & TH_WAIT)) { /* * Still runnable. */ thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch; machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE); ast_t reason = thread->reason; sched_options_t options = SCHED_NONE; if (reason & AST_REBALANCE) { options |= SCHED_REBALANCE; if (reason & AST_QUANTUM) { /* * Having gone to the trouble of forcing this thread off a less preferred core, * we should force the preferable core to reschedule immediately to give this * thread a chance to run instead of just sitting on the run queue where * it may just be stolen back by the idle core we just forced it off. * But only do this at the end of a quantum to prevent cascading effects. */ options |= SCHED_PREEMPT; } } if (reason & AST_QUANTUM) { options |= SCHED_TAILQ; } else if (reason & AST_PREEMPT) { options |= SCHED_HEADQ; } else { options |= (SCHED_PREEMPT | SCHED_TAILQ); } thread_setrun(thread, options); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_buckets[TH_BUCKET_RUN], 0); if (thread->wake_active) { thread->wake_active = FALSE; thread_unlock(thread); thread_wakeup(&thread->wake_active); } else { thread_unlock(thread); } wake_unlock(thread); } else { /* * Waiting. */ boolean_t should_terminate = FALSE; uint32_t new_run_count; int thread_state = thread->state; /* Only the first call to thread_dispatch * after explicit termination should add * the thread to the termination queue */ if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) { should_terminate = TRUE; thread_state |= TH_TERMINATE2; } timer_stop(&thread->runnable_timer, processor->last_dispatch); thread_state &= ~TH_RUN; thread->state = thread_state; thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE; thread->chosen_processor = PROCESSOR_NULL; new_run_count = SCHED(run_count_decr)(thread); #if CONFIG_SCHED_AUTO_JOIN if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) { work_interval_auto_join_unwind(thread); } #endif /* CONFIG_SCHED_AUTO_JOIN */ #if CONFIG_SCHED_SFI if (thread->reason & AST_SFI) { thread->wait_sfi_begin_time = processor->last_dispatch; } #endif machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->reason, thread_state, new_run_count, 0); if (thread_state & TH_WAIT_REPORT) { (*thread->sched_call)(SCHED_CALL_BLOCK, thread); } if (thread->wake_active) { thread->wake_active = FALSE; thread_unlock(thread); thread_wakeup(&thread->wake_active); } else { thread_unlock(thread); } wake_unlock(thread); if (should_terminate) { thread_terminate_enqueue(thread); } } } /* * The thread could have been added to the termination queue, so it's * unsafe to use after this point. 
*/ thread = THREAD_NULL; } int urgency = THREAD_URGENCY_NONE; uint64_t latency = 0; /* Update (new) current thread and reprogram running timers */ thread_lock(self); if (!(self->state & TH_IDLE)) { uint64_t arg1, arg2; #if CONFIG_SCHED_SFI ast_t new_ast; new_ast = sfi_thread_needs_ast(self, NULL); if (new_ast != AST_NONE) { ast_on(new_ast); } #endif if (processor->last_dispatch < self->last_made_runnable_time) { panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time); } assert(self->last_made_runnable_time <= self->last_basepri_change_time); latency = processor->last_dispatch - self->last_made_runnable_time; assert(latency >= self->same_pri_latency); urgency = thread_get_urgency(self, &arg1, &arg2); thread_tell_urgency(urgency, arg1, arg2, latency, self); /* * Start a new CPU limit interval if the previous one has * expired. This should happen before initializing a new * quantum. */ if (cpulimit_affects_quantum && thread_cpulimit_interval_has_expired(processor->last_dispatch)) { thread_cpulimit_restart(processor->last_dispatch); } /* * Get a new quantum if none remaining. */ if (self->quantum_remaining == 0) { thread_quantum_init(self, processor->last_dispatch); } /* * Set up quantum timer and timeslice. */ processor->quantum_end = processor->last_dispatch + self->quantum_remaining; running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self, processor->quantum_end, processor->last_dispatch); if (was_idle) { /* * kperf's running timer is active whenever the idle thread for a * CPU is not running. */ kperf_running_setup(processor, processor->last_dispatch); } running_timers_activate(processor); processor->first_timeslice = TRUE; } else { if (!processor_bootstrap) { running_timers_deactivate(processor); } processor->first_timeslice = FALSE; thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self); } assert(self->block_hint == kThreadWaitNone); self->computation_epoch = processor->last_dispatch; /* * This relies on the interrupt time being tallied up to the thread in the * exception handler epilogue, which is before AST context where preemption * is considered (and the scheduler is potentially invoked to * context switch, here). */ self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach(); self->reason = AST_NONE; processor->starting_pri = self->sched_pri; thread_unlock(self); machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency, processor->last_dispatch); #if defined(CONFIG_SCHED_DEFERRED_AST) /* * TODO: Can we state that redispatching our old thread is also * uninteresting? */ if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) { pset_cancel_deferred_dispatch(processor->processor_set, processor); } #endif } /* * thread_block_reason: * * Forces a reschedule, blocking the caller if a wait * has been asserted. * * If a continuation is specified, then thread_invoke will * attempt to discard the thread's kernel stack. When the * thread resumes, it will execute the continuation function * on a new kernel stack. 
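 *
 * A purely illustrative sketch of a typical caller (event and my_continuation are
 * placeholders, not names defined in this file):
 *
 *     assert_wait(event, THREAD_UNINT);
 *     ... publish whatever state the waker will inspect ...
 *     thread_block(my_continuation);
 *
 * When a continuation is supplied, thread_block() does not return to the call site;
 * the thread resumes in my_continuation (typically on a fresh kernel stack) with its
 * wait result once it runs again.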
*/ wait_result_t thread_block_reason( thread_continue_t continuation, void *parameter, ast_t reason) { thread_t self = current_thread(); processor_t processor; thread_t new_thread; spl_t s; s = splsched(); processor = current_processor(); /* If we're explicitly yielding, force a subsequent quantum */ if (reason & AST_YIELD) { processor->first_timeslice = FALSE; } /* We're handling all scheduling AST's */ ast_off(AST_SCHEDULING); clear_pending_nonurgent_preemption(processor); #if PROC_REF_DEBUG if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) { uthread_assert_zero_proc_refcount(get_bsdthread_info(self)); } #endif #if CONFIG_EXCLAVES if (continuation != NULL) { assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0); } #endif /* CONFIG_EXCLAVES */ self->continuation = continuation; self->parameter = parameter; if (self->state & ~(TH_RUN | TH_IDLE)) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK), reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0); } do { thread_lock(self); new_thread = thread_select(self, processor, &reason); thread_unlock(self); } while (!thread_invoke(self, new_thread, reason)); splx(s); return self->wait_result; } /* * thread_block: * * Block the current thread if a wait has been asserted. */ wait_result_t thread_block( thread_continue_t continuation) { return thread_block_reason(continuation, NULL, AST_NONE); } wait_result_t thread_block_parameter( thread_continue_t continuation, void *parameter) { return thread_block_reason(continuation, parameter, AST_NONE); } /* * thread_run: * * Switch directly from the current thread to the * new thread, handing off our quantum if appropriate. * * New thread must be runnable, and not on a run queue. * * Called at splsched. */ int thread_run( thread_t self, thread_continue_t continuation, void *parameter, thread_t new_thread) { ast_t reason = AST_NONE; if ((self->state & TH_IDLE) == 0) { reason = AST_HANDOFF; } /* Must not get here without a chosen processor */ assert(new_thread->chosen_processor); self->continuation = continuation; self->parameter = parameter; while (!thread_invoke(self, new_thread, reason)) { /* the handoff failed, so we have to fall back to the normal block path */ processor_t processor = current_processor(); reason = AST_NONE; thread_lock(self); new_thread = thread_select(self, processor, &reason); thread_unlock(self); } return self->wait_result; } /* * thread_continue: * * Called at splsched when a thread first receives * a new stack after a continuation. * * Called with THREAD_NULL as the old thread when * invoked by machine_load_context. 
*/ void thread_continue( thread_t thread) { thread_t self = current_thread(); thread_continue_t continuation; void *parameter; DTRACE_SCHED(on__cpu); continuation = self->continuation; parameter = self->parameter; assert(continuation != NULL); #if KPERF kperf_on_cpu(self, continuation, NULL); #endif thread_dispatch(thread, self); self->continuation = self->parameter = NULL; #if SCHED_HYGIENE_DEBUG /* Reset interrupt-masked spin debugging timeout */ ml_spin_debug_clear(self); #endif TLOG(1, "thread_continue: calling call_continuation\n"); boolean_t enable_interrupts = TRUE; /* bootstrap thread, idle thread need to stay interrupts-disabled */ if (thread == THREAD_NULL || (self->state & TH_IDLE)) { enable_interrupts = FALSE; } #if KASAN_TBI kasan_unpoison_stack(self->kernel_stack, kernel_stack_size); #endif /* KASAN_TBI */ call_continuation(continuation, parameter, self->wait_result, enable_interrupts); /*NOTREACHED*/ } void thread_quantum_init(thread_t thread, uint64_t now) { uint64_t new_quantum = 0; switch (thread->sched_mode) { case TH_MODE_REALTIME: new_quantum = thread->realtime.computation; new_quantum = MIN(new_quantum, max_unsafe_rt_computation); break; case TH_MODE_FIXED: new_quantum = SCHED(initial_quantum_size)(thread); new_quantum = MIN(new_quantum, max_unsafe_fixed_computation); break; default: new_quantum = SCHED(initial_quantum_size)(thread); break; } if (cpulimit_affects_quantum) { const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now); /* * If there's no remaining CPU time, the ledger system will * notice and put the thread to sleep. */ if (cpulimit_remaining > 0) { new_quantum = MIN(new_quantum, cpulimit_remaining); } } assert3u(new_quantum, <, UINT32_MAX); assert3u(new_quantum, >, 0); thread->quantum_remaining = (uint32_t)new_quantum; } uint32_t sched_timeshare_initial_quantum_size(thread_t thread) { if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) { return bg_quantum; } else { return std_quantum; } } /* * run_queue_init: * * Initialize a run queue before first use. */ void run_queue_init( run_queue_t rq) { rq->highq = NOPRI; for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) { rq->bitmap[i] = 0; } rq->urgency = rq->count = 0; for (int i = 0; i < NRQS; i++) { circle_queue_init(&rq->queues[i]); } } /* * run_queue_dequeue: * * Perform a dequeue operation on a run queue, * and return the resulting thread. * * The run queue must be locked (see thread_run_queue_remove() * for more info), and not empty. */ thread_t run_queue_dequeue( run_queue_t rq, sched_options_t options) { thread_t thread; circle_queue_t queue = &rq->queues[rq->highq]; if (options & SCHED_HEADQ) { thread = cqe_dequeue_head(queue, struct thread, runq_links); } else { thread = cqe_dequeue_tail(queue, struct thread, runq_links); } assert(thread != THREAD_NULL); assert_thread_magic(thread); thread_clear_runq(thread); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(rq->highq)) { rq->urgency--; assert(rq->urgency >= 0); } if (circle_queue_empty(queue)) { bitmap_clear(rq->bitmap, rq->highq); rq->highq = bitmap_first(rq->bitmap, NRQS); } return thread; } /* * run_queue_enqueue: * * Perform a enqueue operation on a run queue. * * The run queue must be locked (see thread_run_queue_remove() * for more info). 
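 *
 * Returns TRUE when the enqueued thread raised rq->highq (i.e. it is now the
 * highest-priority entry), which callers can treat as a hint that preemption
 * may be needed.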
*/ boolean_t run_queue_enqueue( run_queue_t rq, thread_t thread, sched_options_t options) { circle_queue_t queue = &rq->queues[thread->sched_pri]; boolean_t result = FALSE; assert_thread_magic(thread); if (circle_queue_empty(queue)) { circle_enqueue_tail(queue, &thread->runq_links); rq_bitmap_set(rq->bitmap, thread->sched_pri); if (thread->sched_pri > rq->highq) { rq->highq = thread->sched_pri; result = TRUE; } } else { if (options & SCHED_TAILQ) { circle_enqueue_tail(queue, &thread->runq_links); } else { circle_enqueue_head(queue, &thread->runq_links); } } if (SCHED(priority_is_urgent)(thread->sched_pri)) { rq->urgency++; } SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count++; return result; } /* * run_queue_remove: * * Remove a specific thread from a runqueue. * * The run queue must be locked. */ void run_queue_remove( run_queue_t rq, thread_t thread) { circle_queue_t queue = &rq->queues[thread->sched_pri]; thread_assert_runq_nonnull(thread); assert_thread_magic(thread); circle_dequeue(queue, &thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(thread->sched_pri)) { rq->urgency--; assert(rq->urgency >= 0); } if (circle_queue_empty(queue)) { /* update run queue status */ bitmap_clear(rq->bitmap, thread->sched_pri); rq->highq = bitmap_first(rq->bitmap, NRQS); } thread_clear_runq(thread); } /* * run_queue_peek * * Peek at the runq and return the highest * priority thread from the runq. * * The run queue must be locked. */ thread_t run_queue_peek( run_queue_t rq) { if (rq->count > 0) { circle_queue_t queue = &rq->queues[rq->highq]; thread_t thread = cqe_queue_first(queue, struct thread, runq_links); assert_thread_magic(thread); return thread; } else { return THREAD_NULL; } } static bool rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor) { int pri = thread->sched_pri; assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); int i = pri - BASEPRI_RTQUEUES; rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; bitmap_t *map = rt_run_queue->bitmap; bitmap_set(map, i); queue_t queue = &rt_runq->pri_queue; uint64_t deadline = thread->realtime.deadline; bool preempt = false; bool earliest = false; if (queue_empty(queue)) { enqueue_tail(queue, &thread->runq_links); preempt = true; earliest = true; rt_runq->pri_earliest_deadline = deadline; rt_runq->pri_constraint = thread->realtime.constraint; } else { /* Insert into rt_runq in thread deadline order */ queue_entry_t iter; qe_foreach(iter, queue) { thread_t iter_thread = qe_element(iter, struct thread, runq_links); assert_thread_magic(iter_thread); if (deadline < iter_thread->realtime.deadline) { if (iter == queue_first(queue)) { preempt = true; earliest = true; rt_runq->pri_earliest_deadline = deadline; rt_runq->pri_constraint = thread->realtime.constraint; } insque(&thread->runq_links, queue_prev(iter)); break; } else if (iter == queue_last(queue)) { enqueue_tail(queue, &thread->runq_links); break; } } } if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) { os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed); os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed); os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed); } SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); rt_runq->pri_count++; os_atomic_inc(&rt_run_queue->count, relaxed); thread_set_runq_locked(thread, processor); 
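	/*
	 * The thread is now visible on this pset's RT run queue, and the cached
	 * earliest_deadline/constraint/ed_index have been updated if it became
	 * the new earliest-deadline entry.  "preempt" is true when the thread
	 * landed at the head of its priority queue.
	 */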
CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread); return preempt; } static thread_t rt_runq_dequeue(rt_queue_t rt_run_queue) { bitmap_t *map = rt_run_queue->bitmap; int i = bitmap_first(map, NRTQS); assert((i >= 0) && (i < NRTQS)); rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; if (!sched_rt_runq_strict_priority) { int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed); if (ed_index != i) { assert((ed_index >= 0) && (ed_index < NRTQS)); rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index]; thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links); thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) { /* choose the earliest deadline thread */ rt_runq = ed_runq; i = ed_index; } } } assert(rt_runq->pri_count > 0); uint64_t earliest_deadline = RT_DEADLINE_NONE; uint32_t constraint = RT_CONSTRAINT_NONE; int ed_index = NOPRI; thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links); SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); if (--rt_runq->pri_count > 0) { thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); assert(next_rt != THREAD_NULL); earliest_deadline = next_rt->realtime.deadline; constraint = next_rt->realtime.constraint; ed_index = i; } else { bitmap_clear(map, i); } rt_runq->pri_earliest_deadline = earliest_deadline; rt_runq->pri_constraint = constraint; for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { rt_runq = &rt_run_queue->rt_queue_pri[i]; if (rt_runq->pri_earliest_deadline < earliest_deadline) { earliest_deadline = rt_runq->pri_earliest_deadline; constraint = rt_runq->pri_constraint; ed_index = i; } } os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); os_atomic_dec(&rt_run_queue->count, relaxed); thread_clear_runq(new_thread); CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); return new_thread; } static thread_t rt_runq_first(rt_queue_t rt_run_queue) { bitmap_t *map = rt_run_queue->bitmap; int i = bitmap_first(map, NRTQS); if (i < 0) { return THREAD_NULL; } rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); return next_rt; } static void rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread) { CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread); int pri = thread->sched_pri; assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); int i = pri - BASEPRI_RTQUEUES; rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; bitmap_t *map = rt_run_queue->bitmap; assert(rt_runq->pri_count > 0); uint64_t earliest_deadline = RT_DEADLINE_NONE; uint32_t constraint = RT_CONSTRAINT_NONE; int ed_index = NOPRI; remqueue(&thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); if (--rt_runq->pri_count > 0) { thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); earliest_deadline = next_rt->realtime.deadline; constraint = next_rt->realtime.constraint; ed_index = i; } else { bitmap_clear(map, i); } rt_runq->pri_earliest_deadline = earliest_deadline; rt_runq->pri_constraint = constraint; for (i = bitmap_first(map, NRTQS); i 
>= 0; i = bitmap_next(map, i)) { rt_runq = &rt_run_queue->rt_queue_pri[i]; if (rt_runq->pri_earliest_deadline < earliest_deadline) { earliest_deadline = rt_runq->pri_earliest_deadline; constraint = rt_runq->pri_constraint; ed_index = i; } } os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); os_atomic_dec(&rt_run_queue->count, relaxed); thread_clear_runq_locked(thread); CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); } rt_queue_t sched_rtlocal_runq(processor_set_t pset) { return &pset->rt_runq; } void sched_rtlocal_init(processor_set_t pset) { pset_rt_init(pset); } void sched_rtlocal_queue_shutdown(processor_t processor) { processor_set_t pset = processor->processor_set; thread_t thread; queue_head_t tqueue; pset_lock(pset); /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ if (bit_count(pset_available_cpumap(pset)) > 0) { pset_unlock(pset); return; } queue_init(&tqueue); while (rt_runq_count(pset) > 0) { thread = rt_runq_dequeue(&pset->rt_runq); enqueue_tail(&tqueue, &thread->runq_links); } sched_update_pset_load_average(pset, 0); pset_update_rt_stealable_state(pset); pset_unlock(pset); qe_foreach_element_safe(thread, &tqueue, runq_links) { remqueue(&thread->runq_links); thread_lock(thread); thread_setrun(thread, SCHED_TAILQ); thread_unlock(thread); } } /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */ void sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context) { thread_t thread; pset_node_t node = &pset_node0; processor_set_t pset = node->psets; spl_t s = splsched(); do { while (pset != NULL) { pset_lock(pset); bitmap_t *map = pset->rt_runq.bitmap; for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i]; qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) { if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; } } } pset_unlock(pset); pset = pset->pset_list; } } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); splx(s); } int64_t sched_rtlocal_runq_count_sum(void) { pset_node_t node = &pset_node0; processor_set_t pset = node->psets; int64_t count = 0; do { while (pset != NULL) { count += pset->rt_runq.runq_stats.count_sum; pset = pset->pset_list; } } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); return count; } /* * Called with stealing_pset locked and * returns with stealing_pset locked * but the lock will have been dropped * if a thread is returned. */ thread_t sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline) { if (!sched_allow_rt_steal) { return THREAD_NULL; } pset_map_t pset_map = stealing_pset->node->pset_map; bit_clear(pset_map, stealing_pset->pset_id); processor_set_t pset = stealing_pset; processor_set_t target_pset; uint64_t target_deadline; retry: target_pset = NULL; target_deadline = earliest_deadline - rt_deadline_epsilon; for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) { processor_set_t nset = pset_array[pset_id]; /* * During startup, while pset_array[] and node->pset_map are still being initialized, * the update to pset_map may become visible to this cpu before the update to pset_array[]. 
* It would be good to avoid inserting a memory barrier here that is only needed during startup, * so just check nset is not NULL instead. */ if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) { target_deadline = nset->stealable_rt_threads_earliest_deadline; target_pset = nset; } } if (target_pset != NULL) { pset = change_locked_pset(pset, target_pset); if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) { thread_t new_thread = rt_runq_dequeue(&pset->rt_runq); pset_update_rt_stealable_state(pset); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0); pset = change_locked_pset(pset, stealing_pset); return new_thread; } pset = change_locked_pset(pset, stealing_pset); earliest_deadline = rt_runq_earliest_deadline(pset); goto retry; } pset = change_locked_pset(pset, stealing_pset); return THREAD_NULL; } /* * pset is locked */ thread_t sched_rt_choose_thread(processor_set_t pset) { processor_t processor = current_processor(); if (SCHED(steal_thread_enabled)(pset)) { do { bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id); if (spill_pending) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2); } thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset)); if (new_thread != THREAD_NULL) { if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3); } return new_thread; } } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)); } if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4); } if (rt_runq_count(pset) > 0) { thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset)); assert(new_thread != THREAD_NULL); pset_update_rt_stealable_state(pset); return new_thread; } return THREAD_NULL; } /* * realtime_queue_insert: * * Enqueue a thread for realtime execution. */ static bool realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread) { pset_assert_locked(pset); bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor); pset_update_rt_stealable_state(pset); return preempt; } /* * realtime_setrun: * * Dispatch a thread for realtime execution. * * Thread must be locked. Associated pset must * be locked, and is returned unlocked. 
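 *
 * For threads with a tight constraint (at or below rt_constraint_threshold),
 * up to sched_rt_n_backup_processors additional processors may be signalled
 * besides the chosen one, so that whichever becomes available first can pull
 * the thread from the RT run queue; urgent IPIs already pending in excess of
 * the queued RT threads count against that number.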
*/ static void realtime_setrun( processor_t chosen_processor, thread_t thread) { processor_set_t pset = chosen_processor->processor_set; pset_assert_locked(pset); bool pset_is_locked = true; int n_backup = 0; if (thread->realtime.constraint <= rt_constraint_threshold) { n_backup = sched_rt_n_backup_processors; } assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS)); int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset); if (existing_backups > 0) { n_backup = n_backup - existing_backups; if (n_backup < 0) { n_backup = 0; } } sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {}; processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {}; thread->chosen_processor = chosen_processor; /* */ assert(thread->bound_processor == PROCESSOR_NULL); realtime_queue_insert(chosen_processor, pset, thread); processor_t processor = chosen_processor; int count = 0; for (int i = 0; i <= n_backup; i++) { if (i == 0) { ipi_type[i] = SCHED_IPI_NONE; ipi_processor[i] = processor; count++; ast_t preempt = AST_NONE; if (thread->sched_pri > processor->current_pri) { preempt = (AST_PREEMPT | AST_URGENT); } else if (thread->sched_pri == processor->current_pri) { if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) { preempt = (AST_PREEMPT | AST_URGENT); } } if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { if (processor == current_processor()) { pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ast_on(preempt); if ((preempt & AST_URGENT) == AST_URGENT) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1); } } if ((preempt & AST_PREEMPT) == AST_PREEMPT) { bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); } } else { ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT); } } else if (processor->state == PROCESSOR_DISPATCHING) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2); } } else { if (processor == current_processor()) { ast_on(preempt); if ((preempt & AST_URGENT) == AST_URGENT) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3); } } if ((preempt & AST_PREEMPT) == AST_PREEMPT) { bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); } } else { ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT); } } } else { /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. 
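				 * A backup processor signalled by a later iteration of this loop may
				 * still pull the thread sooner.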
*/ } } else { if (!pset_is_locked) { pset_lock(pset); } ipi_type[i] = SCHED_IPI_NONE; ipi_processor[i] = PROCESSOR_NULL; pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]); if (ipi_processor[i] == PROCESSOR_NULL) { break; } count++; KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE, ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1); #if defined(__x86_64__) #define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0))) if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) { processor_t p0 = ipi_processor[0]; processor_t p1 = ipi_processor[1]; assert(p0 && p1); if (p_is_good(p0) && p_is_good(p1)) { /* * Both the chosen processor and the first backup are non-cpu0 primaries, * so there is no need for a 2nd backup processor. */ break; } } #endif } } if (pset_is_locked) { pset_unlock(pset); } assert((count > 0) && (count <= (n_backup + 1))); for (int i = 0; i < count; i++) { assert(ipi_processor[i] != PROCESSOR_NULL); sched_ipi_perform(ipi_processor[i], ipi_type[i]); } } sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t pset, processor_t dst, thread_t thread, __unused sched_ipi_event_t event) { #if defined(CONFIG_SCHED_DEFERRED_AST) #if CONFIG_THREAD_GROUPS if (thread) { struct thread_group *tg = thread_group_get(thread); if (thread_group_uses_immediate_ipi(tg)) { return SCHED_IPI_IMMEDIATE; } } #endif /* CONFIG_THREAD_GROUPS */ if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) { return SCHED_IPI_DEFERRED; } #else /* CONFIG_SCHED_DEFERRED_AST */ (void) thread; panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id); #endif /* CONFIG_SCHED_DEFERRED_AST */ return SCHED_IPI_NONE; } sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event) { sched_ipi_type_t ipi_type = SCHED_IPI_NONE; assert(dst != NULL); processor_set_t pset = dst->processor_set; if (current_processor() == dst) { return SCHED_IPI_NONE; } bool dst_idle = (dst->state == PROCESSOR_IDLE); if (dst_idle) { pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING); } ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event); switch (ipi_type) { case SCHED_IPI_NONE: return SCHED_IPI_NONE; #if defined(CONFIG_SCHED_DEFERRED_AST) case SCHED_IPI_DEFERRED: bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id); break; #endif /* CONFIG_SCHED_DEFERRED_AST */ default: if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4); } bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id); break; } return ipi_type; } sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) { sched_ipi_type_t ipi_type = SCHED_IPI_NONE; boolean_t deferred_ipi_supported = false; processor_set_t pset = dst->processor_set; #if defined(CONFIG_SCHED_DEFERRED_AST) deferred_ipi_supported = true; #endif /* CONFIG_SCHED_DEFERRED_AST */ switch (event) { case SCHED_IPI_EVENT_SPILL: case SCHED_IPI_EVENT_SMT_REBAL: case SCHED_IPI_EVENT_REBALANCE: case SCHED_IPI_EVENT_BOUND_THR: case SCHED_IPI_EVENT_RT_PREEMPT: /* * The RT preempt, spill, SMT rebalance, rebalance and the bound thread * scenarios use immediate IPIs always. */ ipi_type = dst_idle ? 
SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; break; case SCHED_IPI_EVENT_PREEMPT: /* In the preemption case, use immediate IPIs for RT threads */ if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) { ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; break; } /* * For Non-RT threads preemption, * If the core is active, use immediate IPIs. * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI. */ if (deferred_ipi_supported && dst_idle) { return sched_ipi_deferred_policy(pset, dst, thread, event); } ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; break; default: panic("Unrecognized scheduler IPI event type %d", event); } assert(ipi_type != SCHED_IPI_NONE); return ipi_type; } void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi) { switch (ipi) { case SCHED_IPI_NONE: break; case SCHED_IPI_IDLE: machine_signal_idle(dst); break; case SCHED_IPI_IMMEDIATE: cause_ast_check(dst); break; case SCHED_IPI_DEFERRED: machine_signal_idle_deferred(dst); break; default: panic("Unrecognized scheduler IPI type: %d", ipi); } } #if defined(CONFIG_SCHED_TIMESHARE_CORE) boolean_t priority_is_urgent(int priority) { return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE; } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ /* * processor_setrun: * * Dispatch a thread for execution on a * processor. * * Thread must be locked. Associated pset must * be locked, and is returned unlocked. */ static void processor_setrun( processor_t processor, thread_t thread, integer_t options) { processor_set_t pset = processor->processor_set; pset_assert_locked(pset); ast_t preempt = AST_NONE; enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; thread->chosen_processor = processor; /* * Set preemption mode. */ #if defined(CONFIG_SCHED_DEFERRED_AST) /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */ #endif if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) { preempt = (AST_PREEMPT | AST_URGENT); } else if (processor->current_is_eagerpreempt) { preempt = (AST_PREEMPT | AST_URGENT); } else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) { if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) { preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; } else { preempt = AST_NONE; } } else { preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; } if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) { /* * Having gone to the trouble of forcing this thread off a less preferred core, * we should force the preferable core to reschedule immediately to give this * thread a chance to run instead of just sitting on the run queue where * it may just be stolen back by the idle core we just forced it off. 
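	 * Forcing AST_PREEMPT here routes the dispatch below through the
	 * preemption path rather than a plain enqueue.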
*/ preempt |= AST_PREEMPT; } SCHED(processor_enqueue)(processor, thread, options); sched_update_pset_load_average(pset, 0); if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { ipi_action = eExitIdle; } else if (processor->state == PROCESSOR_DISPATCHING) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5); } } else if (processor->state == PROCESSOR_RUNNING && (thread->sched_pri >= processor->current_pri)) { ipi_action = eInterruptRunning; } } else { /* * New thread is not important enough to preempt what is running, but * special processor states may need special handling */ if (processor->state == PROCESSOR_IDLE) { ipi_action = eExitIdle; } else if (processor->state == PROCESSOR_DISPATCHING) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6); } } } if (ipi_action != eDoNothing) { if (processor == current_processor()) { if (ipi_action == eExitIdle) { pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); } if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) { ast_on(preempt); } if ((preempt & AST_URGENT) == AST_URGENT) { if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7); } } else { if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7); } } if ((preempt & AST_PREEMPT) == AST_PREEMPT) { bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); } else { bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); } } else { sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT; ipi_type = sched_ipi_action(processor, thread, event); } } pset_unlock(pset); sched_ipi_perform(processor, ipi_type); if (ipi_action != eDoNothing && processor == current_processor()) { ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt); ast_on(new_preempt); } } /* * choose_next_pset: * * Return the next sibling pset containing * available processors. * * Returns the original pset if none other is * suitable. */ static processor_set_t choose_next_pset( processor_set_t pset) { processor_set_t nset = pset; do { nset = next_pset(nset); /* * Sometimes during startup the pset_map can contain a bit * for a pset that isn't fully published in pset_array because * the pset_map read isn't an acquire load. * * In order to avoid needing an acquire barrier here, just bail * out. */ if (nset == PROCESSOR_SET_NULL) { return pset; } } while (nset->online_processor_count < 1 && nset != pset); return nset; } /* * choose_processor: * * Choose a processor for the thread, beginning at * the pset. Accepts an optional processor hint in * the pset. * * Returns a processor, possibly from a different pset. * * The thread must be locked. The pset must be locked, * and the resulting pset is locked on return. 
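 *
 * May return PROCESSOR_NULL if no usable processor can be found (for
 * example when candidate CPUs are derecommended or shut down during the
 * search); in that case the pset lock has already been dropped.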
*/ processor_t choose_processor( processor_set_t starting_pset, processor_t processor, thread_t thread) { processor_set_t pset = starting_pset; processor_set_t nset; assert(thread->sched_pri <= MAXPRI); /* * Prefer the hinted processor, when appropriate. */ /* Fold last processor hint from secondary processor to its primary */ if (processor != PROCESSOR_NULL) { processor = processor->processor_primary; } /* * Only consult platform layer if pset is active, which * it may not be in some cases when a multi-set system * is going to sleep. */ if (pset->online_processor_count) { if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) { processor_t mc_processor = machine_choose_processor(pset, processor); if (mc_processor != PROCESSOR_NULL) { processor = mc_processor->processor_primary; } } } /* * At this point, we may have a processor hint, and we may have * an initial starting pset. If the hint is not in the pset, or * if the hint is for a processor in an invalid state, discard * the hint. */ if (processor != PROCESSOR_NULL) { if (processor->processor_set != pset) { processor = PROCESSOR_NULL; } else if (!processor->is_recommended) { processor = PROCESSOR_NULL; } else { switch (processor->state) { case PROCESSOR_START: case PROCESSOR_PENDING_OFFLINE: case PROCESSOR_OFF_LINE: /* * Hint is for a processor that cannot support running new threads. */ processor = PROCESSOR_NULL; break; case PROCESSOR_IDLE: /* * Hint is for an idle processor. Assume it is no worse than any other * idle processor. The platform layer had an opportunity to provide * the "least cost idle" processor above. */ if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) { uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask); uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask; /* * If the rotation bitmask to force a migration is set for this core and there's an idle core that * that needn't be avoided, don't continue running on the same core. */ if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) { return processor; } } processor = PROCESSOR_NULL; break; case PROCESSOR_RUNNING: case PROCESSOR_DISPATCHING: /* * Hint is for an active CPU. This fast-path allows * realtime threads to preempt non-realtime threads * to regain their previous executing processor. */ if (thread->sched_pri >= BASEPRI_RTQUEUES) { if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) { return processor; } processor = PROCESSOR_NULL; } /* Otherwise, use hint as part of search below */ break; default: processor = PROCESSOR_NULL; break; } } } /* * Iterate through the processor sets to locate * an appropriate processor. Seed results with * a last-processor hint, if available, so that * a search must find something strictly better * to replace it. * * A primary/secondary pair of SMT processors are * "unpaired" if the primary is busy but its * corresponding secondary is idle (so the physical * core has full use of its resources). 
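 *
 * (An idle secondary whose primary is busy is also tracked, as a fallback
 * if no suitable primary can be preempted.)
 */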
*/ integer_t lowest_priority = MAXPRI + 1; integer_t lowest_secondary_priority = MAXPRI + 1; integer_t lowest_unpaired_primary_priority = MAXPRI + 1; integer_t lowest_idle_secondary_priority = MAXPRI + 1; integer_t lowest_count = INT_MAX; processor_t lp_processor = PROCESSOR_NULL; processor_t lp_unpaired_primary_processor = PROCESSOR_NULL; processor_t lp_idle_secondary_processor = PROCESSOR_NULL; processor_t lp_paired_secondary_processor = PROCESSOR_NULL; processor_t lc_processor = PROCESSOR_NULL; if (processor != PROCESSOR_NULL) { /* All other states should be enumerated above. */ assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING); assert(thread->sched_pri < BASEPRI_RTQUEUES); lowest_priority = processor->current_pri; lp_processor = processor; lowest_count = SCHED(processor_runq_count)(processor); lc_processor = processor; } if (thread->sched_pri >= BASEPRI_RTQUEUES) { pset_node_t node = pset->node; bool include_ast_urgent_pending_cpus = false; cpumap_t ast_urgent_pending; try_again: ast_urgent_pending = 0; int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus; for (; consider_secondaries < 2; consider_secondaries++) { pset = change_locked_pset(pset, starting_pset); do { cpumap_t available_map = pset_available_cpumap(pset); if (available_map == 0) { goto no_available_cpus; } processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false); if (processor) { return processor; } if (consider_secondaries) { processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus); if (processor) { /* * Instead of looping through all the psets to find the global * furthest deadline processor, preempt the first candidate found. * The preempted thread will then find any other available far deadline * processors to preempt. 
*/ return processor; } ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask; if (rt_runq_count(pset) < lowest_count) { int cpuid = bit_first(available_map); assert(cpuid >= 0); lc_processor = processor_array[cpuid]; lowest_count = rt_runq_count(pset); } } no_available_cpus: nset = next_pset(pset); if (nset != starting_pset) { pset = change_locked_pset(pset, nset); } } while (nset != starting_pset); } /* Short cut for single pset nodes */ if (bit_count(node->pset_map) == 1) { if (lc_processor) { pset_assert_locked(lc_processor->processor_set); return lc_processor; } } else { if (ast_urgent_pending && !include_ast_urgent_pending_cpus) { /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */ include_ast_urgent_pending_cpus = true; goto try_again; } } processor = lc_processor; if (processor) { pset = change_locked_pset(pset, processor->processor_set); /* Check that chosen processor is still usable */ cpumap_t available_map = pset_available_cpumap(pset); if (bit_test(available_map, processor->cpu_id)) { return processor; } /* processor is no longer usable */ processor = PROCESSOR_NULL; } pset_assert_locked(pset); pset_unlock(pset); return PROCESSOR_NULL; } /* No realtime threads from this point on */ assert(thread->sched_pri < BASEPRI_RTQUEUES); do { /* * Choose an idle processor, in pset traversal order */ uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask); uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask; /* there shouldn't be a pending AST if the processor is idle */ assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0); /* * Look at the preferred cores first. */ int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen); if (cpuid < 0) { cpuid = lsb_first(preferred_idle_primary_map); } if (cpuid >= 0) { processor = processor_array[cpuid]; pset->cpu_preferred_last_chosen = cpuid; return processor; } /* * Look at the cores that don't need to be avoided next. */ if (pset->perfcontrol_cpu_migration_bitmask != 0) { uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask; cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen); if (cpuid < 0) { cpuid = lsb_first(non_avoided_idle_primary_map); } if (cpuid >= 0) { processor = processor_array[cpuid]; pset->cpu_preferred_last_chosen = cpuid; return processor; } } /* * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available. */ cpuid = lsb_first(idle_primary_map); if (cpuid >= 0) { processor = processor_array[cpuid]; return processor; } /* * Otherwise, enumerate active and idle processors to find primary candidates * with lower priority/etc. 
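 *
 * The scan below rotates through the active CPUs starting just past the
 * last chosen CPU, recording the lowest-priority primary, the
 * lowest-priority running secondary, and the least-loaded CPU as
 * candidates.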
*/ uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) & pset->recommended_bitmask & ~pset->pending_AST_URGENT_cpu_mask); if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) { active_map &= ~pset->pending_AST_PREEMPT_cpu_mask; } active_map = bit_ror64(active_map, (pset->last_chosen + 1)); for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) { cpuid = ((rotid + pset->last_chosen + 1) & 63); processor = processor_array[cpuid]; integer_t cpri = processor->current_pri; processor_t primary = processor->processor_primary; if (primary != processor) { /* If primary is running a NO_SMT thread, don't choose its secondary */ if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) { if (cpri < lowest_secondary_priority) { lowest_secondary_priority = cpri; lp_paired_secondary_processor = processor; } } } else { if (cpri < lowest_priority) { lowest_priority = cpri; lp_processor = processor; } } integer_t ccount = SCHED(processor_runq_count)(processor); if (ccount < lowest_count) { lowest_count = ccount; lc_processor = processor; } } /* * For SMT configs, these idle secondary processors must have active primary. Otherwise * the idle primary would have short-circuited the loop above */ uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & ~pset->primary_map & pset->recommended_bitmask); /* there shouldn't be a pending AST if the processor is idle */ assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0); assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0); for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) { processor = processor_array[cpuid]; processor_t cprimary = processor->processor_primary; integer_t primary_pri = cprimary->current_pri; /* * TODO: This should also make the same decisions * as secondary_can_run_realtime_thread * * TODO: Keep track of the pending preemption priority * of the primary to make this more accurate. */ /* If the primary is running a no-smt thread, then don't choose its secondary */ if (cprimary->state == PROCESSOR_RUNNING && processor_active_thread_no_smt(cprimary)) { continue; } /* * Find the idle secondary processor with the lowest priority primary * * We will choose this processor as a fallback if we find no better * primary to preempt. */ if (primary_pri < lowest_idle_secondary_priority) { lp_idle_secondary_processor = processor; lowest_idle_secondary_priority = primary_pri; } /* Find the the lowest priority active primary with idle secondary */ if (primary_pri < lowest_unpaired_primary_priority) { /* If the primary processor is offline or starting up, it's not a candidate for this path */ if (cprimary->state != PROCESSOR_RUNNING && cprimary->state != PROCESSOR_DISPATCHING) { continue; } if (!cprimary->is_recommended) { continue; } /* if the primary is pending preemption, don't try to re-preempt it */ if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) { continue; } if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE && bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) { continue; } lowest_unpaired_primary_priority = primary_pri; lp_unpaired_primary_processor = cprimary; } } /* * We prefer preempting a primary processor over waking up its secondary. * The secondary will then be woken up by the preempted thread. 
*/ if (thread->sched_pri > lowest_unpaired_primary_priority) { pset->last_chosen = lp_unpaired_primary_processor->cpu_id; return lp_unpaired_primary_processor; } /* * We prefer preempting a lower priority active processor over directly * waking up an idle secondary. * The preempted thread will then find the idle secondary. */ if (thread->sched_pri > lowest_priority) { pset->last_chosen = lp_processor->cpu_id; return lp_processor; } /* * lc_processor is used to indicate the best processor set run queue * on which to enqueue a thread when all available CPUs are busy with * higher priority threads, so try to make sure it is initialized. */ if (lc_processor == PROCESSOR_NULL) { cpumap_t available_map = pset_available_cpumap(pset); cpuid = lsb_first(available_map); if (cpuid >= 0) { lc_processor = processor_array[cpuid]; lowest_count = SCHED(processor_runq_count)(lc_processor); } } /* * Move onto the next processor set. * * If all primary processors in this pset are running a higher * priority thread, move on to next pset. Only when we have * exhausted the search for primary processors do we * fall back to secondaries. */ #if CONFIG_SCHED_EDGE /* * The edge scheduler expects a CPU to be selected from the pset it passed in * as the starting pset for non-RT workloads. The edge migration algorithm * should already have considered idle CPUs and loads to decide the starting_pset; * which means that this loop can be short-circuted. */ nset = starting_pset; #else /* CONFIG_SCHED_EDGE */ nset = next_pset(pset); #endif /* CONFIG_SCHED_EDGE */ if (nset != starting_pset) { pset = change_locked_pset(pset, nset); } } while (nset != starting_pset); /* * Make sure that we pick a running processor, * and that the correct processor set is locked. * Since we may have unlocked the candidate processor's * pset, it may have changed state. * * All primary processors are running a higher priority * thread, so the only options left are enqueuing on * the secondary processor that would perturb the least priority * primary, or the least busy primary. */ /* lowest_priority is evaluated in the main loops above */ if (lp_idle_secondary_processor != PROCESSOR_NULL) { processor = lp_idle_secondary_processor; } else if (lp_paired_secondary_processor != PROCESSOR_NULL) { processor = lp_paired_secondary_processor; } else if (lc_processor != PROCESSOR_NULL) { processor = lc_processor; } else { processor = PROCESSOR_NULL; } if (processor) { pset = change_locked_pset(pset, processor->processor_set); /* Check that chosen processor is still usable */ cpumap_t available_map = pset_available_cpumap(pset); if (bit_test(available_map, processor->cpu_id)) { pset->last_chosen = processor->cpu_id; return processor; } /* processor is no longer usable */ processor = PROCESSOR_NULL; } pset_assert_locked(pset); pset_unlock(pset); return PROCESSOR_NULL; } /* * Default implementation of SCHED(choose_node)() * for single node systems */ pset_node_t sched_choose_node(__unused thread_t thread) { return &pset_node0; } /* * choose_starting_pset: * * Choose a starting processor set for the thread. * May return a processor hint within the pset. * * Returns a starting processor set, to be used by * choose_processor. * * The thread must be locked. The resulting pset is unlocked on return, * and is chosen without taking any pset locks. 
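 *
 * *processor_hint is written only when the thread has a last-processor
 * hint belonging to the returned pset; otherwise it is left untouched.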
*/ processor_set_t choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint) { processor_set_t pset; processor_t processor = PROCESSOR_NULL; if (thread->affinity_set != AFFINITY_SET_NULL) { /* * Use affinity set policy hint. */ pset = thread->affinity_set->aset_pset; } else if (thread->last_processor != PROCESSOR_NULL) { /* * Simple (last processor) affinity case. */ processor = thread->last_processor; pset = processor->processor_set; } else { /* * No Affinity case: * * Utilitize a per task hint to spread threads * among the available processor sets. * NRG this seems like the wrong thing to do. * See also task->pset_hint = pset in thread_setrun() */ pset = get_threadtask(thread)->pset_hint; if (pset == PROCESSOR_SET_NULL) { pset = current_processor()->processor_set; } pset = choose_next_pset(pset); } if (!bit_test(node->pset_map, pset->pset_id)) { /* pset is not from this node so choose one that is */ int id = lsb_first(node->pset_map); if (id < 0) { /* startup race, so check again under the node lock */ lck_spin_lock(&pset_node_lock); if (bit_test(node->pset_map, pset->pset_id)) { id = pset->pset_id; } else { id = lsb_first(node->pset_map); } lck_spin_unlock(&pset_node_lock); } assert(id >= 0); pset = pset_array[id]; } if (bit_count(node->pset_map) == 1) { /* Only a single pset in this node */ goto out; } bool avoid_cpu0 = false; #if defined(__x86_64__) if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) { /* Avoid the pset containing cpu0 */ avoid_cpu0 = true; /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */ assert(bit_test(pset_array[0]->cpu_bitmask, 0)); } #endif if (thread->sched_pri >= BASEPRI_RTQUEUES) { pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map); if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) { if (avoid_cpu0) { rt_target_map = bit_ror64(rt_target_map, 1); } int rotid = lsb_first(rt_target_map); if (rotid >= 0) { int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid; pset = pset_array[id]; goto out; } } if (!pset->is_SMT || !sched_allow_rt_smt) { /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */ goto out; } rt_target_map = atomic_load(&node->pset_non_rt_map); if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) { if (avoid_cpu0) { rt_target_map = bit_ror64(rt_target_map, 1); } int rotid = lsb_first(rt_target_map); if (rotid >= 0) { int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid; pset = pset_array[id]; goto out; } } /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */ } else { pset_map_t idle_map = atomic_load(&node->pset_idle_map); if (!bit_test(idle_map, pset->pset_id)) { int next_idle_pset_id = lsb_first(idle_map); if (next_idle_pset_id >= 0) { pset = pset_array[next_idle_pset_id]; } } } out: if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) { processor = PROCESSOR_NULL; } if (processor != PROCESSOR_NULL) { *processor_hint = processor; } assert(pset != NULL); return pset; } /* * thread_setrun: * * Dispatch thread for execution, onto an idle * processor or run queue, and signal a preemption * as appropriate. * * Thread must be locked. 
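 *
 * The thread must be runnable (TH_RUN) and must not already be on a
 * run queue.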
*/ void thread_setrun( thread_t thread, sched_options_t options) { processor_t processor = PROCESSOR_NULL; processor_set_t pset; assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN); thread_assert_runq_null(thread); #if CONFIG_PREADOPT_TG /* We know that the thread is not in the runq by virtue of being in this * function and the thread is not self since we are running. We can safely * resolve the thread group hierarchy and modify the thread's thread group * here. */ thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread); #endif /* * Update priority if needed. */ if (SCHED(can_update_priority)(thread)) { SCHED(update_priority)(thread); } thread->sfi_class = sfi_thread_classify(thread); if (thread->bound_processor == PROCESSOR_NULL) { /* * Unbound case. * * Usually, this loop will only be executed once, * but if CLPC derecommends a processor after it has been chosen, * or if a processor is shut down after it is chosen, * choose_processor() may return NULL, so a retry * may be necessary. A single retry will usually * be enough, and we can't afford to retry too many times * because interrupts are disabled. */ #define CHOOSE_PROCESSOR_MAX_RETRIES 3 for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) { processor_t processor_hint = PROCESSOR_NULL; pset_node_t node = SCHED(choose_node)(thread); processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint); pset_lock(starting_pset); processor = SCHED(choose_processor)(starting_pset, processor_hint, thread); if (processor != PROCESSOR_NULL) { pset = processor->processor_set; pset_assert_locked(pset); break; } } /* * If choose_processor() still returns NULL, * which is very unlikely, we need a fallback. */ if (processor == PROCESSOR_NULL) { bool unlock_available_cores_lock = false; if (sched_all_cpus_offline()) { /* * There are no available processors * because we're in final system shutdown. * Enqueue on the master processor and we'll * handle it when it powers back up. */ processor = master_processor; } else if (support_bootcpu_shutdown) { /* * Grab the sched_available_cores_lock to select * some available processor and prevent it from * becoming offline while we enqueue the thread. * * This is very close to a lock inversion, but * places that do call thread_setrun with this * lock held know that the current cpu will be * schedulable, so we won't fall out of * choose_processor. */ simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); unlock_available_cores_lock = true; int last_resort_cpu = sched_last_resort_cpu(); processor = processor_array[last_resort_cpu]; } else { /* * The master processor is never shut down, always safe to choose. 
*/ processor = master_processor; } pset = processor->processor_set; pset_lock(pset); assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended)); if (unlock_available_cores_lock) { simple_unlock(&sched_available_cores_lock); } } task_t task = get_threadtask(thread); if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) { task->pset_hint = pset; /* NRG this is done without holding the task lock */ } SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0); assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended)); } else { /* * Bound case: * * Unconditionally dispatch on the processor. */ processor = thread->bound_processor; pset = processor->processor_set; pset_lock(pset); SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0); } /* * Dispatch the thread on the chosen processor. * TODO: This should be based on sched_mode, not sched_pri */ if (thread->sched_pri >= BASEPRI_RTQUEUES) { realtime_setrun(processor, thread); } else { processor_setrun(processor, thread, options); } /* pset is now unlocked */ if (thread->bound_processor == PROCESSOR_NULL) { SCHED(check_spill)(pset, thread); } } processor_set_t task_choose_pset( task_t task) { processor_set_t pset = task->pset_hint; if (pset != PROCESSOR_SET_NULL) { pset = choose_next_pset(pset); } return pset; } /* * Check for a preemption point in * the current context. * * Called at splsched with thread locked. 
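 *
 * Returns the preemption AST flags to apply (possibly AST_NONE).  When no
 * urgent preemption is needed, any pending urgent-AST IPI for this
 * processor is acknowledged here.
 *
 * Typical caller pattern, as used elsewhere in this file:
 *
 *	ast_t preempt;
 *	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
 *		ast_on(preempt);
 *	}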
*/ ast_t csw_check( thread_t thread, processor_t processor, ast_t check_reason) { processor_set_t pset = processor->processor_set; assert(thread == processor->active_thread); pset_lock(pset); processor_state_update_from_thread(processor, thread, true); ast_t preempt = csw_check_locked(thread, processor, pset, check_reason); /* Acknowledge the IPI if we decided not to preempt */ if ((preempt & AST_URGENT) == 0) { if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8); } } if ((preempt & AST_PREEMPT) == 0) { bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id); } pset_unlock(pset); return update_pending_nonurgent_preemption(processor, preempt); } void clear_pending_nonurgent_preemption(processor_t processor) { if (!processor->pending_nonurgent_preemption) { return; } KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END); processor->pending_nonurgent_preemption = false; running_timer_clear(processor, RUNNING_TIMER_PREEMPT); } ast_t update_pending_nonurgent_preemption(processor_t processor, ast_t reason) { if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) { clear_pending_nonurgent_preemption(processor); return reason; } if (nonurgent_preemption_timer_abs == 0) { /* Preemption timer not enabled */ return reason; } if (current_thread()->state & TH_IDLE) { /* idle threads don't need nonurgent preemption */ return reason; } if (processor->pending_nonurgent_preemption) { /* Timer is already armed, no need to do it again */ return reason; } if (ml_did_interrupt_userspace()) { /* * We're preempting userspace here, so we don't need * to defer the preemption. Force AST_URGENT * so that we can avoid arming this timer without risking * ast_taken_user deciding to spend too long in kernel * space to handle other ASTs. */ return reason | AST_URGENT; } /* * We've decided to do a nonurgent preemption when running in * kernelspace. We defer the preemption until reaching userspace boundary * to give a grace period for locks etc to be dropped and to reach * a clean preemption point, so that the preempting thread doesn't * always immediately hit the lock that the waking thread still holds. * * Arm a timer to enforce that the preemption executes within a bounded * time if the thread doesn't block or return to userspace quickly. */ processor->pending_nonurgent_preemption = true; KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START, reason); uint64_t now = mach_absolute_time(); uint64_t deadline = now + nonurgent_preemption_timer_abs; running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL, now, deadline); return reason; } /* * Check for preemption at splsched with * pset locked and processor as the current * processor. */ ast_t csw_check_locked( thread_t thread, processor_t processor, processor_set_t pset, ast_t check_reason) { assert(processor == current_processor()); /* * If the current thread is running on a processor that is no longer recommended, * urgently preempt it, at which point thread_select() should * try to idle the processor and re-dispatch the thread to a recommended processor. 
*/ if (!processor->is_recommended) { return check_reason | AST_PREEMPT | AST_URGENT; } if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { return check_reason | AST_PREEMPT | AST_URGENT; } if (rt_runq_count(pset) > 0) { if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) { return check_reason | AST_PREEMPT | AST_URGENT; } else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) { return check_reason | AST_PREEMPT | AST_URGENT; } else { return check_reason | AST_PREEMPT; } } ast_t result = SCHED(processor_csw_check)(processor); if (result != AST_NONE) { return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE); } /* * Same for avoid-processor * * TODO: Should these set AST_REBALANCE? */ if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) { return check_reason | AST_PREEMPT; } /* * Even though we could continue executing on this processor, a * secondary SMT core should try to shed load to another primary core. * * TODO: Should this do the same check that thread_select does? i.e. * if no bound threads target this processor, and idle primaries exist, preempt * The case of RT threads existing is already taken care of above */ if (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor) { return check_reason | AST_PREEMPT; } if (thread->state & TH_SUSP) { return check_reason | AST_PREEMPT; } #if CONFIG_SCHED_SFI /* * Current thread may not need to be preempted, but maybe needs * an SFI wait? */ result = sfi_thread_needs_ast(thread, NULL); if (result != AST_NONE) { return result; } #endif return AST_NONE; } /* * Handle preemption IPI or IPI in response to setting an AST flag * Triggered by cause_ast_check * Called at splsched */ void ast_check(processor_t processor) { smr_ack_ipi(); if (processor->state != PROCESSOR_RUNNING) { return; } SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_AST_CHECK) | DBG_FUNC_START); thread_t thread = processor->active_thread; assert(thread == current_thread()); /* * Pairs with task_restartable_ranges_synchronize */ thread_lock(thread); thread_reset_pcs_ack_IPI(thread); /* * Propagate thread ast to processor. * (handles IPI in response to setting AST flag) */ ast_propagate(thread); /* * Stash the old urgency and perfctl values to find out if * csw_check updates them. */ thread_urgency_t old_urgency = processor->current_urgency; perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class; ast_t preempt; if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) { ast_on(preempt); } if (old_urgency != processor->current_urgency) { /* * Urgency updates happen with the thread lock held (ugh). * TODO: This doesn't notice QoS changes... */ uint64_t urgency_param1, urgency_param2; thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread); } thread_unlock(thread); if (old_perfctl_class != processor->current_perfctl_class) { /* * We updated the perfctl class of this thread from another core. * Let CLPC know that the currently running thread has a new * class. 
*/ machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE, mach_approximate_time(), 0, thread); } SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt); } void thread_preempt_expire( timer_call_param_t p0, __unused timer_call_param_t p1) { processor_t processor = p0; assert(processor == current_processor()); assert(p1 == NULL); thread_t thread = current_thread(); /* * This is set and cleared by the current core, so we will * never see a race with running timer expiration */ assert(processor->pending_nonurgent_preemption); clear_pending_nonurgent_preemption(processor); thread_lock(thread); /* * Check again to see if it's still worth a * context switch, but this time force enable kernel preemption */ ast_t preempt = csw_check(thread, processor, AST_URGENT); if (preempt) { ast_on(preempt); } thread_unlock(thread); KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt); } /* * set_sched_pri: * * Set the scheduled priority of the specified thread. * * This may cause the thread to change queues. * * Thread must be locked. */ void set_sched_pri( thread_t thread, int16_t new_priority, set_sched_pri_options_t options) { bool is_current_thread = (thread == current_thread()); bool removed_from_runq = false; bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY); int16_t old_priority = thread->sched_pri; /* If we're already at this priority, no need to mess with the runqueue */ if (new_priority == old_priority) { #if CONFIG_SCHED_CLUTCH /* For the first thread in the system, the priority is correct but * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch * scheduler relies on the bucket being set for all threads, update * its bucket here. */ if (thread->th_sched_bucket == TH_BUCKET_RUN) { assert(thread == vm_pageout_scan_thread); SCHED(update_thread_bucket)(thread); } #endif /* CONFIG_SCHED_CLUTCH */ return; } if (is_current_thread) { assert(thread->state & TH_RUN); thread_assert_runq_null(thread); } else { removed_from_runq = thread_run_queue_remove(thread); } thread->sched_pri = new_priority; #if CONFIG_SCHED_CLUTCH /* * Since for the clutch scheduler, the thread's bucket determines its runq * in the hierarchy it is important to update the bucket when the thread * lock is held and the thread has been removed from the runq hierarchy. */ SCHED(update_thread_bucket)(thread); #endif /* CONFIG_SCHED_CLUTCH */ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), (uintptr_t)thread_tid(thread), thread->base_pri, thread->sched_pri, thread->sched_usage, 0); if (removed_from_runq) { thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ); } else if (is_current_thread) { processor_t processor = thread->last_processor; assert(processor == current_processor()); thread_urgency_t old_urgency = processor->current_urgency; /* * When dropping in priority, check if the thread no longer belongs on core. * If a thread raises its own priority, don't aggressively rebalance it. * * * csw_check does a processor_state_update_from_thread, but * we should do our own if we're being lazy. */ if (!lazy_update && new_priority < old_priority) { ast_t preempt; if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) { ast_on(preempt); } } else { processor_state_update_from_thread(processor, thread, false); } /* * set_sched_pri doesn't alter RT params. 
We expect direct base priority/QoS * class alterations from user space to occur relatively infrequently, hence * those are lazily handled. QoS classes have distinct priority bands, and QoS * inheritance is expected to involve priority changes. */ if (processor->current_urgency != old_urgency) { uint64_t urgency_param1, urgency_param2; thread_urgency_t new_urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); thread_tell_urgency(new_urgency, urgency_param1, urgency_param2, 0, thread); } /* TODO: only call this if current_perfctl_class changed */ uint64_t ctime = mach_approximate_time(); machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime); } else if (thread->state & TH_RUN) { processor_t processor = thread->last_processor; if (!lazy_update && processor != PROCESSOR_NULL && processor != current_processor() && processor->active_thread == thread) { cause_ast_check(processor); } } } /* * thread_run_queue_remove_for_handoff * * Pull a thread or its (recursive) push target out of the runqueue * so that it is ready for thread_run() * * Called at splsched * * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled. * This may be different than the thread that was passed in. */ thread_t thread_run_queue_remove_for_handoff(thread_t thread) { thread_t pulled_thread = THREAD_NULL; thread_lock(thread); /* * Check that the thread is not bound to a different processor, * NO_SMT flag is not set on the thread, cluster type of * processor matches with thread if the thread is pinned to a * particular cluster and that realtime is not involved. * * Next, pull it off its run queue. If it doesn't come, it's not eligible. */ processor_t processor = current_processor(); if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) && (!thread_no_smt(thread)) && (processor->current_pri < BASEPRI_RTQUEUES) && (thread->sched_pri < BASEPRI_RTQUEUES) #if __AMP__ && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) || processor->processor_set->pset_id == thread->th_bound_cluster_id) #endif /* __AMP__ */ ) { if (thread_run_queue_remove(thread)) { pulled_thread = thread; } } thread_unlock(thread); return pulled_thread; } /* * thread_prepare_for_handoff * * Make the thread ready for handoff. * If the thread was runnable then pull it off the runq, if the thread could * not be pulled, return NULL. * * If the thread was woken up from wait for handoff, make sure it is not bound to * different processor. * * Called at splsched * * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled. * This may be different than the thread that was passed in. */ thread_t thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option) { thread_t pulled_thread = THREAD_NULL; if (option & THREAD_HANDOFF_SETRUN_NEEDED) { processor_t processor = current_processor(); thread_lock(thread); /* * Check that the thread is not bound to a different processor, * NO_SMT flag is not set on the thread and cluster type of * processor matches with thread if the thread is pinned to a * particular cluster. Call setrun instead if above conditions * are not satisfied. 
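 * In this path the thread was woken specifically for handoff, so it is not
 * sitting on any run queue; the checks below only decide whether this
 * processor may run it directly, or whether it must instead go through
 * thread_setrun() to be placed normally.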
*/ if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) && (!thread_no_smt(thread)) #if __AMP__ && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) || processor->processor_set->pset_id == thread->th_bound_cluster_id) #endif /* __AMP__ */ ) { pulled_thread = thread; } else { thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); } thread_unlock(thread); } else { pulled_thread = thread_run_queue_remove_for_handoff(thread); } return pulled_thread; } /* * thread_run_queue_remove: * * Remove a thread from its current run queue and * return TRUE if successful. * * Thread must be locked. * * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the * run queues because the caller locked the thread. Otherwise * the thread is on a run queue, but could be chosen for dispatch * and removed by another processor under a different lock, which * will set thread->runq to PROCESSOR_NULL. * * Hence the thread select path must not rely on anything that could * be changed under the thread lock after calling this function, * most importantly thread->sched_pri. */ boolean_t thread_run_queue_remove( thread_t thread) { boolean_t removed = FALSE; if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) { /* Thread isn't runnable */ thread_assert_runq_null(thread); return FALSE; } processor_t processor = thread_get_runq(thread); if (processor == PROCESSOR_NULL) { /* * The thread is either not on the runq, * or is in the midst of being removed from the runq. * * runq is set to NULL under the pset lock, not the thread * lock, so the thread may still be in the process of being dequeued * from the runq. It will wait in invoke for the thread lock to be * dropped. */ return FALSE; } if (thread->sched_pri < BASEPRI_RTQUEUES) { return SCHED(processor_queue_remove)(processor, thread); } processor_set_t pset = processor->processor_set; pset_lock(pset); /* * Must re-read the thread runq after acquiring the pset lock, in * case another core swooped in before us to dequeue the thread. */ if (thread_get_runq_locked(thread) != PROCESSOR_NULL) { /* * Thread is on the RT run queue and we have a lock on * that run queue. 
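 * Removing an RT thread also changes what other psets could steal from this
 * one, so the stealable state is refreshed below while the pset lock is
 * still held.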
*/ rt_runq_remove(SCHED(rt_runq)(pset), thread); pset_update_rt_stealable_state(pset); removed = TRUE; } pset_unlock(pset); return removed; } /* * Put the thread back where it goes after a thread_run_queue_remove * * Thread must have been removed under the same thread lock hold * * thread locked, at splsched */ void thread_run_queue_reinsert(thread_t thread, sched_options_t options) { thread_assert_runq_null(thread); assert(thread->state & (TH_RUN)); thread_setrun(thread, options); } void sys_override_cpu_throttle(boolean_t enable_override) { if (enable_override) { cpu_throttle_enabled = 0; } else { cpu_throttle_enabled = 1; } } thread_urgency_t thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) { uint64_t urgency_param1 = 0, urgency_param2 = 0; task_t task = get_threadtask_early(thread); thread_urgency_t urgency; if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) { urgency_param1 = 0; urgency_param2 = 0; urgency = THREAD_URGENCY_NONE; } else if (thread->sched_mode == TH_MODE_REALTIME) { urgency_param1 = thread->realtime.period; urgency_param2 = thread->realtime.deadline; urgency = THREAD_URGENCY_REAL_TIME; } else if (cpu_throttle_enabled && (thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE)) { /* * Threads that are running at low priority but are not * tagged with a specific QoS are separated out from * the "background" urgency. Performance management * subsystem can decide to either treat these threads * as normal threads or look at other signals like thermal * levels for optimal power/perf tradeoffs for a platform. */ boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread); boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1); /* * Background urgency applied when thread priority is * MAXPRI_THROTTLE or lower and thread is not promoted * and thread has a QoS specified */ urgency_param1 = thread->sched_pri; urgency_param2 = thread->base_pri; if (thread_lacks_qos && !task_is_suppressed) { urgency = THREAD_URGENCY_LOWPRI; } else { urgency = THREAD_URGENCY_BACKGROUND; } } else { /* For otherwise unclassified threads, report throughput QoS parameters */ urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS); urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS); urgency = THREAD_URGENCY_NORMAL; } if (arg1 != NULL) { *arg1 = urgency_param1; } if (arg2 != NULL) { *arg2 = urgency_param2; } return urgency; } perfcontrol_class_t thread_get_perfcontrol_class(thread_t thread) { /* Special case handling */ if (thread->state & TH_IDLE) { return PERFCONTROL_CLASS_IDLE; } if (thread->sched_mode == TH_MODE_REALTIME) { return PERFCONTROL_CLASS_REALTIME; } /* perfcontrol_class based on base_pri */ if (thread->base_pri <= MAXPRI_THROTTLE) { return PERFCONTROL_CLASS_BACKGROUND; } else if (thread->base_pri <= BASEPRI_UTILITY) { return PERFCONTROL_CLASS_UTILITY; } else if (thread->base_pri <= BASEPRI_DEFAULT) { return PERFCONTROL_CLASS_NONUI; } else if (thread->base_pri <= BASEPRI_USER_INITIATED) { return PERFCONTROL_CLASS_USER_INITIATED; } else if (thread->base_pri <= BASEPRI_FOREGROUND) { return PERFCONTROL_CLASS_UI; } else { if (get_threadtask(thread) == kernel_task) { /* * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL. * All other lower priority kernel threads should be treated * as regular threads for performance control purposes. 
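 * The priority bands are tested from lowest to highest above, so this branch
 * is only reached for threads whose base_pri is above BASEPRI_FOREGROUND.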
*/ return PERFCONTROL_CLASS_KERNEL; } return PERFCONTROL_CLASS_ABOVEUI; } } /* * This is the processor idle loop, which just looks for other threads * to execute. Processor idle threads invoke this without supplying a * current thread, to idle without an asserted wait state. * * Returns the next thread to execute if dispatched directly. */ #if 0 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__) #else #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0) #endif #if (DEVELOPMENT || DEBUG) int sched_idle_delay_cpuid = -1; #endif thread_t processor_idle( thread_t thread, processor_t processor) { processor_set_t pset = processor->processor_set; struct recount_snap snap = { 0 }; (void)splsched(); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START, (uintptr_t)thread_tid(thread), 0, 0, 0, 0); SCHED_STATS_INC(idle_transitions); assert(processor->running_timers_active == false); recount_snapshot(&snap); recount_processor_idle(&processor->pr_recount, &snap); while (1) { /* * Ensure that updates to my processor and pset state, * made by the IPI source processor before sending the IPI, * are visible on this processor now (even though we don't * take the pset lock yet). */ atomic_thread_fence(memory_order_acquire); if (processor->state != PROCESSOR_IDLE) { break; } if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) { break; } #if defined(CONFIG_SCHED_DEFERRED_AST) if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) { break; } #endif if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { break; } if (processor->is_recommended && (processor->processor_primary == processor)) { if (rt_runq_count(pset)) { break; } } else { if (SCHED(processor_bound_count)(processor)) { break; } } IDLE_KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0); machine_track_platform_idle(TRUE); machine_idle(); /* returns with interrupts enabled */ machine_track_platform_idle(FALSE); #if (DEVELOPMENT || DEBUG) if (processor->cpu_id == sched_idle_delay_cpuid) { delay(500); } #endif (void)splsched(); atomic_thread_fence(memory_order_acquire); IDLE_KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0); /* * Check if we should call sched_timeshare_consider_maintenance() here. * The CPU was woken out of idle due to an interrupt and we should do the * call only if the processor is still idle. If the processor is non-idle, * the threads running on the processor would do the call as part of * context switching. */ if (processor->state == PROCESSOR_IDLE) { sched_timeshare_consider_maintenance(mach_absolute_time(), true); } if (!SCHED(processor_queue_empty)(processor)) { /* Secondary SMT processors respond to directed wakeups * exclusively. Some platforms induce 'spurious' SMT wakeups. */ if (processor->processor_primary == processor) { break; } } } recount_snapshot(&snap); recount_processor_run(&processor->pr_recount, &snap); smr_cpu_join(processor, snap.rsn_time_mach); ast_t reason = AST_NONE; /* We're handling all scheduling AST's */ ast_off(AST_SCHEDULING); /* * thread_select will move the processor from dispatching to running, * or put it in idle if there's nothing to do.
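 * current_thread() at this point is the processor's idle thread, so
 * thread_select() is choosing a successor for it; whatever it returns is
 * handed back to idle_thread(), which runs it via thread_run().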
*/ thread_t cur_thread = current_thread(); thread_lock(cur_thread); thread_t new_thread = thread_select(cur_thread, processor, &reason); thread_unlock(cur_thread); assert(processor->running_timers_active == false); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0); return new_thread; } /* * Each processor has a dedicated thread which * executes the idle loop when there is no suitable * previous context. * * This continuation is entered with interrupts disabled. */ void idle_thread(__assert_only void* parameter, __unused wait_result_t result) { assert(ml_get_interrupts_enabled() == FALSE); assert(parameter == NULL); processor_t processor = current_processor(); smr_cpu_leave(processor, processor->last_dispatch); /* * Ensure that anything running in idle context triggers * preemption-disabled checks. */ disable_preemption_without_measurements(); /* * Enable interrupts temporarily to handle any pending interrupts * or IPIs before deciding to sleep */ spllo(); thread_t new_thread = processor_idle(THREAD_NULL, processor); /* returns with interrupts disabled */ enable_preemption(); if (new_thread != THREAD_NULL) { thread_run(processor->idle_thread, idle_thread, NULL, new_thread); /*NOTREACHED*/ } thread_block(idle_thread); /*NOTREACHED*/ } void idle_thread_create( processor_t processor, thread_continue_t continuation) { kern_return_t result; thread_t thread; spl_t s; char name[MAXTHREADNAMESIZE]; result = kernel_thread_create(continuation, NULL, MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) { panic("idle_thread_create failed: %d", result); } snprintf(name, sizeof(name), "idle #%d", processor->cpu_id); thread_set_thread_name(thread, name); s = splsched(); thread_lock(thread); thread->bound_processor = processor; thread->chosen_processor = processor; processor->idle_thread = thread; thread->sched_pri = thread->base_pri = IDLEPRI; thread->state = (TH_RUN | TH_IDLE); thread->options |= TH_OPT_IDLE_THREAD; thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time(); thread_unlock(thread); splx(s); thread_deallocate(thread); } /* * sched_startup: * * Kicks off scheduler services. * * Called at splsched. */ void sched_startup(void) { kern_return_t result; thread_t thread; simple_lock_init(&sched_vm_group_list_lock, 0); result = kernel_thread_start_priority((thread_continue_t)sched_init_thread, NULL, MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) { panic("sched_startup"); } thread_deallocate(thread); assert_thread_magic(thread); /* * Yield to the sched_init_thread once, to * initialize our own thread after being switched * back to. * * The current thread is the only other thread * active at this point. */ thread_block(THREAD_CONTINUE_NULL); assert_thread_magic(thread); } #if __arm64__ static _Atomic uint64_t sched_perfcontrol_callback_deadline; #endif /* __arm64__ */ #if defined(CONFIG_SCHED_TIMESHARE_CORE) static volatile uint64_t sched_maintenance_deadline; static uint64_t sched_tick_last_abstime; static uint64_t sched_tick_delta; uint64_t sched_tick_max_delta; /* * sched_init_thread: * * Perform periodic bookkeeping functions about ten * times per second. 
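 * The cadence is driven by sched_tick_interval; the thread created in
 * sched_startup() runs sched_init_thread(), which dispatches to
 * SCHED(maintenance_continuation)(). For the timeshare scheduler core that
 * is sched_timeshare_maintenance_continue() below.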
*/ void sched_timeshare_maintenance_continue(void) { uint64_t sched_tick_ctime, late_time; struct sched_update_scan_context scan_context = { .earliest_bg_make_runnable_time = UINT64_MAX, .earliest_normal_make_runnable_time = UINT64_MAX, .earliest_rt_make_runnable_time = UINT64_MAX }; sched_tick_ctime = mach_absolute_time(); if (__improbable(sched_tick_last_abstime == 0)) { sched_tick_last_abstime = sched_tick_ctime; late_time = 0; sched_tick_delta = 1; } else { late_time = sched_tick_ctime - sched_tick_last_abstime; sched_tick_delta = late_time / sched_tick_interval; /* Ensure a delta of 1, since the interval could be slightly * smaller than the sched_tick_interval due to dispatch * latencies. */ sched_tick_delta = MAX(sched_tick_delta, 1); /* In the event interrupt latencies or platform * idle events that advanced the timebase resulted * in periods where no threads were dispatched, * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA * iterations. */ sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA); sched_tick_last_abstime = sched_tick_ctime; sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta); } scan_context.sched_tick_last_abstime = sched_tick_last_abstime; KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START, sched_tick_delta, late_time, 0, 0, 0); /* Add a number of pseudo-ticks corresponding to the elapsed interval * This could be greater than 1 if substantial intervals where * all processors are idle occur, which rarely occurs in practice. */ sched_tick += sched_tick_delta; update_vm_info(); /* * Compute various averages. */ compute_averages(sched_tick_delta); /* * Scan the run queues for threads which * may need to be updated, and find the earliest runnable thread on the runqueue * to report its latency. */ SCHED(thread_update_scan)(&scan_context); SCHED(rt_runq_scan)(&scan_context); uint64_t ctime = mach_absolute_time(); uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ? ctime - scan_context.earliest_bg_make_runnable_time : 0; uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ? ctime - scan_context.earliest_normal_make_runnable_time : 0; uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ? ctime - scan_context.earliest_rt_make_runnable_time : 0; machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency); /* * Check to see if the special sched VM group needs attention. */ sched_vm_group_maintenance(); #if __arm64__ /* Check to see if the recommended cores failsafe is active */ sched_recommended_cores_maintenance(); #endif /* __arm64__ */ #if DEBUG || DEVELOPMENT #if __x86_64__ #include /* Check for long-duration interrupts */ mp_interrupt_watchdog(); #endif /* __x86_64__ */ #endif /* DEBUG || DEVELOPMENT */ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END, sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG], sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0); assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT); thread_block((thread_continue_t)sched_timeshare_maintenance_continue); /*NOTREACHED*/ } static uint64_t sched_maintenance_wakeups; /* * Determine if the set of routines formerly driven by a maintenance timer * must be invoked, based on a deadline comparison. Signals the scheduler * maintenance thread on deadline expiration. 
Must be invoked at an interval * lower than the "sched_tick_interval", currently accomplished by * invocation via the quantum expiration timer and at context switch time. * Performance matters: this routine reuses a timestamp approximating the * current absolute time received from the caller, and should perform * no more than a comparison against the deadline in the common case. */ void sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point) { uint64_t deadline = sched_maintenance_deadline; if (__improbable(ctime >= deadline)) { if (__improbable(current_thread() == sched_maintenance_thread)) { return; } OSMemoryBarrier(); uint64_t ndeadline = ctime + sched_tick_interval; if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) { thread_wakeup((event_t)sched_timeshare_maintenance_continue); sched_maintenance_wakeups++; smr_maintenance(ctime); } } smr_cpu_tick(ctime, safe_point); #if !CONFIG_SCHED_CLUTCH /* * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch * scheduler, the load is maintained at the thread group and bucket level. */ uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed); if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) { uint64_t new_deadline = 0; if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) { compute_sched_load(); new_deadline = ctime + sched_load_compute_interval_abs; os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed); } } #endif /* CONFIG_SCHED_CLUTCH */ #if __arm64__ uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed); if (__improbable(perf_deadline && ctime >= perf_deadline)) { /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */ if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) { machine_perfcontrol_deadline_passed(perf_deadline); } } #endif /* __arm64__ */ } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ void sched_init_thread(void) { thread_block(THREAD_CONTINUE_NULL); thread_t thread = current_thread(); thread_set_thread_name(thread, "sched_maintenance_thread"); sched_maintenance_thread = thread; SCHED(maintenance_continuation)(); /*NOTREACHED*/ } #if defined(CONFIG_SCHED_TIMESHARE_CORE) /* * thread_update_scan / runq_scan: * * Scan the run queues to account for timesharing threads * which need to be updated. * * Scanner runs in two passes. Pass one squirrels likely * threads away in an array, pass two does the update. * * This is necessary because the run queue is locked for * the candidate scan, but the thread is locked for the update. * * Array should be sized to make forward progress, without * disabling preemption for long periods. 
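 *
 * Roughly:
 *   pass 1: runq_scan() (or sched_clutch_timeshare_scan()) walks the run
 *           queues under the run-queue lock and stashes stale timeshare
 *           threads, each with a reference, into thread_update_array via
 *           thread_update_add_thread();
 *   pass 2: thread_update_process_threads() locks each stashed thread and
 *           calls SCHED(update_priority)() if it is still runnable and its
 *           sched_stamp is still stale, then drops the reference.
 * If the array fills up, the scan reports that a retry is needed.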
*/ #define THREAD_UPDATE_SIZE 128 static thread_t thread_update_array[THREAD_UPDATE_SIZE]; static uint32_t thread_update_count = 0; /* Returns TRUE if thread was added, FALSE if thread_update_array is full */ boolean_t thread_update_add_thread(thread_t thread) { if (thread_update_count == THREAD_UPDATE_SIZE) { return FALSE; } thread_update_array[thread_update_count++] = thread; thread_reference(thread); return TRUE; } void thread_update_process_threads(void) { assert(thread_update_count <= THREAD_UPDATE_SIZE); for (uint32_t i = 0; i < thread_update_count; i++) { thread_t thread = thread_update_array[i]; assert_thread_magic(thread); thread_update_array[i] = THREAD_NULL; spl_t s = splsched(); thread_lock(thread); if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) { SCHED(update_priority)(thread); } thread_unlock(thread); splx(s); thread_deallocate(thread); } thread_update_count = 0; } static boolean_t runq_scan_thread( thread_t thread, sched_update_scan_context_t scan_context) { assert_thread_magic(thread); if (thread->sched_stamp != sched_tick && thread->sched_mode == TH_MODE_TIMESHARE) { if (thread_update_add_thread(thread) == FALSE) { return TRUE; } } if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; } } else { if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time; } } return FALSE; } /* * Scan a runq for candidate threads. * * Returns TRUE if retry is needed. */ boolean_t runq_scan( run_queue_t runq, sched_update_scan_context_t scan_context) { int count = runq->count; int queue_index; assert(count >= 0); if (count == 0) { return FALSE; } for (queue_index = bitmap_first(runq->bitmap, NRQS); queue_index >= 0; queue_index = bitmap_next(runq->bitmap, queue_index)) { thread_t thread; circle_queue_t queue = &runq->queues[queue_index]; cqe_foreach_element(thread, queue, runq_links) { assert(count > 0); if (runq_scan_thread(thread, scan_context) == TRUE) { return TRUE; } count--; } } return FALSE; } #if CONFIG_SCHED_CLUTCH boolean_t sched_clutch_timeshare_scan( queue_t thread_queue, uint16_t thread_count, sched_update_scan_context_t scan_context) { if (thread_count == 0) { return FALSE; } thread_t thread; qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) { if (runq_scan_thread(thread, scan_context) == TRUE) { return TRUE; } thread_count--; } assert(thread_count == 0); return FALSE; } #endif /* CONFIG_SCHED_CLUTCH */ #endif /* CONFIG_SCHED_TIMESHARE_CORE */ bool thread_is_eager_preempt(thread_t thread) { return thread->sched_flags & TH_SFLAG_EAGERPREEMPT; } void thread_set_eager_preempt(thread_t thread) { spl_t s = splsched(); thread_lock(thread); assert(!thread_is_eager_preempt(thread)); thread->sched_flags |= TH_SFLAG_EAGERPREEMPT; if (thread == current_thread()) { /* csw_check updates current_is_eagerpreempt on the processor */ ast_t ast = csw_check(thread, current_processor(), AST_NONE); thread_unlock(thread); if (ast != AST_NONE) { thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast); } } else { processor_t last_processor = thread->last_processor; if (last_processor != PROCESSOR_NULL && last_processor->state == PROCESSOR_RUNNING && last_processor->active_thread == thread) { cause_ast_check(last_processor); } 
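/*
 * The remote processor re-runs csw_check() when it handles the ast_check
 * IPI sent above, and observes TH_SFLAG_EAGERPREEMPT at that point.
 */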
thread_unlock(thread); } splx(s); } void thread_clear_eager_preempt(thread_t thread) { spl_t s = splsched(); thread_lock(thread); assert(thread_is_eager_preempt(thread)); thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT; if (thread == current_thread()) { current_processor()->current_is_eagerpreempt = false; } thread_unlock(thread); splx(s); } /* * Scheduling statistics */ void sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri) { struct sched_statistics *stats; boolean_t to_realtime = FALSE; stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor); stats->csw_count++; if (otherpri >= BASEPRI_REALTIME) { stats->rt_sched_count++; to_realtime = TRUE; } if ((reasons & AST_PREEMPT) != 0) { stats->preempt_count++; if (selfpri >= BASEPRI_REALTIME) { stats->preempted_rt_count++; } if (to_realtime) { stats->preempted_by_rt_count++; } } } void sched_stats_handle_runq_change(struct runq_stats *stats, int old_count) { uint64_t timestamp = mach_absolute_time(); stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count; stats->last_change_timestamp = timestamp; } /* * For calls from assembly code */ #undef thread_wakeup void thread_wakeup( event_t x); void thread_wakeup( event_t x) { thread_wakeup_with_result(x, THREAD_AWAKENED); } boolean_t preemption_enabled(void) { return get_preemption_level() == 0 && ml_get_interrupts_enabled(); } static void sched_timer_deadline_tracking_init(void) { nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1); nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2); } /* * Check that all CPUs are successfully powered up in places where that's expected. */ static void check_all_cpus_are_done_starting(processor_start_kind_t start_kind) { /* * `processor_count` may include registered CPUs above cpus= or cpumask= limit. * Use machine_info.logical_cpu_max for the CPU IDs that matter. 
*/ for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) { processor_t processor = processor_array[cpu_id]; processor_wait_for_start(processor, start_kind); } } /* * Find some available online CPU that threads can be enqueued on * * Called with the sched_available_cores_lock held */ static int sched_last_resort_cpu(void) { simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED); int last_resort_cpu = lsb_first(pcs.pcs_effective.pcs_online_cores); if (last_resort_cpu == -1) { panic("no last resort cpu found!"); } return last_resort_cpu; } static void assert_no_processors_in_transition_locked() { assert(pcs.pcs_in_kernel_sleep == false); /* All processors must be either running or offline */ assert(pcs.pcs_managed_cores == (processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING] | processor_offline_state_map[PROCESSOR_OFFLINE_FULLY_OFFLINE])); /* All state transitions must be quiesced at this point */ assert(pcs.pcs_effective.pcs_online_cores == processor_offline_state_map[PROCESSOR_OFFLINE_RUNNING]); } static struct powered_cores_state sched_compute_requested_powered_cores() { simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED); struct powered_cores_state output = { .pcs_online_cores = pcs.pcs_managed_cores, .pcs_powerdown_recommended_cores = pcs.pcs_managed_cores, .pcs_tempdown_cores = 0, }; if (!pcs.pcs_init_completed) { return output; } /* * if we unify this with derecommendation, note that only sleep should stop derecommendation, * not dtrace et al */ if (pcs.pcs_powerdown_suspend_count) { return output; } else { /* * The cores power clients like ANE require or * the kernel cannot offline */ cpumap_t system_required_powered_cores = pcs.pcs_required_online_pmgr | pcs.pcs_required_online_system; cpumap_t online_cores_goal; if (pcs.pcs_user_online_core_control) { /* This is our new goal state for powered cores */ output.pcs_powerdown_recommended_cores = pcs.pcs_requested_online_user; online_cores_goal = pcs.pcs_requested_online_user | system_required_powered_cores; } else { /* Remove the cores CLPC wants to power down */ cpumap_t clpc_wanted_powered_cores = pcs.pcs_managed_cores; clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_user; clpc_wanted_powered_cores &= pcs.pcs_requested_online_clpc_system; output.pcs_powerdown_recommended_cores = clpc_wanted_powered_cores; online_cores_goal = clpc_wanted_powered_cores | system_required_powered_cores; /* Any cores in managed cores that are not in wanted powered become temporary */ output.pcs_tempdown_cores = (pcs.pcs_managed_cores & ~clpc_wanted_powered_cores); /* Future: Treat CLPC user/system separately. */ } if (online_cores_goal == 0) { /* * If we're somehow trying to disable all CPUs, * force online the lowest numbered CPU. */ online_cores_goal = BIT(lsb_first(pcs.pcs_managed_cores)); } #if RHODES_CLUSTER_POWERDOWN_WORKAROUND /* * Because warm CPU boot from WFI is not currently implemented, * we cannot power down only one CPU in a cluster, so we force up * all the CPUs in the cluster if any one CPU is up in the cluster. * Once all CPUs are disabled, then the whole cluster goes down at once. 
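 * Concretely, the loop below widens online_cores_goal: any cluster with at
 * least one CPU in the goal set contributes its entire cpu_mask.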
*/ cpumap_t workaround_online_cores = 0; const ml_topology_info_t* topology = ml_get_topology_info(); for (unsigned int i = 0; i < topology->num_clusters; i++) { ml_topology_cluster_t* cluster = &topology->clusters[i]; if ((cluster->cpu_mask & online_cores_goal) != 0) { workaround_online_cores |= cluster->cpu_mask; } } online_cores_goal = workaround_online_cores; #endif /* RHODES_CLUSTER_POWERDOWN_WORKAROUND */ output.pcs_online_cores = online_cores_goal; } return output; } static bool sched_needs_update_requested_powered_cores() { if (!pcs.pcs_init_completed) { return false; } struct powered_cores_state requested = sched_compute_requested_powered_cores(); struct powered_cores_state effective = pcs.pcs_effective; if (requested.pcs_powerdown_recommended_cores != effective.pcs_powerdown_recommended_cores || requested.pcs_online_cores != effective.pcs_online_cores || requested.pcs_tempdown_cores != effective.pcs_tempdown_cores) { return true; } else { return false; } } kern_return_t sched_processor_exit_user(processor_t processor) { assert(processor); lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); assert(preemption_enabled()); kern_return_t result; spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (!enable_processor_exit) { /* This API is not supported on this device. */ result = KERN_NOT_SUPPORTED; goto unlock; } if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) { /* This CPU can never change state outside of sleep. */ result = KERN_NOT_SUPPORTED; goto unlock; } /* * Future: Instead of failing, simulate the processor * being shut down via derecommendation and decrementing active count. */ if (bit_test(pcs.pcs_required_online_pmgr, processor->cpu_id)) { /* PMGR won't let us power down this CPU right now. */ result = KERN_FAILURE; goto unlock; } if (pcs.pcs_powerdown_suspend_count) { /* A tool that disables CPU powerdown is active. */ result = KERN_FAILURE; goto unlock; } if (!bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) { /* The CPU is already powered off by userspace. */ result = KERN_NODE_DOWN; goto unlock; } if ((pcs.pcs_recommended_cores & pcs.pcs_effective.pcs_online_cores) == BIT(processor->cpu_id)) { /* This is the last available core, can't shut it down. 
*/ result = KERN_RESOURCE_SHORTAGE; goto unlock; } result = KERN_SUCCESS; if (!pcs.pcs_user_online_core_control) { pcs.pcs_user_online_core_control = true; } bit_clear(pcs.pcs_requested_online_user, processor->cpu_id); if (sched_needs_update_requested_powered_cores()) { sched_update_powered_cores_drops_lock(REASON_USER, s); } unlock: simple_unlock(&sched_available_cores_lock); splx(s); return result; } kern_return_t sched_processor_start_user(processor_t processor) { assert(processor); lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); assert(preemption_enabled()); kern_return_t result; spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (!enable_processor_exit) { result = KERN_NOT_SUPPORTED; goto unlock; } if (bit_test(pcs.pcs_required_online_system, processor->cpu_id)) { result = KERN_NOT_SUPPORTED; goto unlock; } /* Not allowed to start an SMT processor while SMT is disabled */ if ((sched_enable_smt == 0) && (processor->processor_primary != processor)) { result = KERN_FAILURE; goto unlock; } if (pcs.pcs_powerdown_suspend_count) { result = KERN_FAILURE; goto unlock; } if (bit_test(pcs.pcs_requested_online_user, processor->cpu_id)) { result = KERN_FAILURE; goto unlock; } result = KERN_SUCCESS; bit_set(pcs.pcs_requested_online_user, processor->cpu_id); /* * Once the user puts all CPUs back online, * we can resume automatic cluster power down. */ if (pcs.pcs_requested_online_user == pcs.pcs_managed_cores) { pcs.pcs_user_online_core_control = false; } if (sched_needs_update_requested_powered_cores()) { sched_update_powered_cores_drops_lock(REASON_USER, s); } unlock: simple_unlock(&sched_available_cores_lock); splx(s); return result; } sched_cond_atomic_t sched_update_powered_cores_wakeup; thread_t sched_update_powered_cores_thread; static void OS_NORETURN sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused); /* * After all processors have been ml_processor_register'ed and processor_boot'ed * the scheduler can finalize its datastructures and allow CPU power state changes. * * Enforce that this only happens *once*. More than once is definitely not OK. rdar://121270513 */ void sched_cpu_init_completed(void) { static bool sched_cpu_init_completed_called = false; if (!os_atomic_cmpxchg(&sched_cpu_init_completed_called, false, true, relaxed)) { panic("sched_cpu_init_completed called twice! %d", sched_cpu_init_completed_called); } if (SCHED(cpu_init_completed) != NULL) { SCHED(cpu_init_completed)(); } /* Wait for any cpu that is still starting, and enforce that they eventually complete. */ check_all_cpus_are_done_starting(PROCESSOR_FIRST_BOOT); lck_mtx_lock(&cluster_powerdown_lock); assert(sched_update_powered_cores_thread == THREAD_NULL); sched_cond_init(&sched_update_powered_cores_wakeup); kern_return_t result = kernel_thread_start_priority( sched_update_powered_cores_continue, NULL, MAXPRI_KERNEL, &sched_update_powered_cores_thread); if (result != KERN_SUCCESS) { panic("failed to create sched_update_powered_cores thread"); } thread_set_thread_name(sched_update_powered_cores_thread, "sched_update_powered_cores"); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(pcs.pcs_init_completed == false); pcs.pcs_managed_cores = pcs.pcs_effective.pcs_online_cores; assert(__builtin_popcountll(pcs.pcs_managed_cores) == machine_info.logical_cpu_max); /* If CLPC tries to cluster power down before this point, it's ignored. 
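 * The requested-online masks therefore start out below as "all managed
 * cores", and pcs_init_completed is only set once that baseline state has
 * been established and verified by the asserts that follow.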
*/ pcs.pcs_requested_online_user = pcs.pcs_managed_cores; pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores; pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores; cpumap_t system_required_cores = 0; /* * Ask the platform layer which CPUs are allowed to * be powered off outside of system sleep. */ for (int cpu_id = 0; cpu_id < machine_info.logical_cpu_max; cpu_id++) { if (!ml_cpu_can_exit(cpu_id)) { bit_set(system_required_cores, cpu_id); } } pcs.pcs_required_online_system = system_required_cores; pcs.pcs_effective.pcs_powerdown_recommended_cores = pcs.pcs_managed_cores; pcs.pcs_requested = sched_compute_requested_powered_cores(); assert(pcs.pcs_requested.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores); assert(pcs.pcs_requested.pcs_online_cores == pcs.pcs_managed_cores); assert(pcs.pcs_requested.pcs_tempdown_cores == 0); assert(pcs.pcs_effective.pcs_powerdown_recommended_cores == pcs.pcs_managed_cores); assert(pcs.pcs_effective.pcs_online_cores == pcs.pcs_managed_cores); assert(pcs.pcs_effective.pcs_tempdown_cores == 0); pcs.pcs_init_completed = true; simple_unlock(&sched_available_cores_lock); splx(s); lck_mtx_unlock(&cluster_powerdown_lock); /* Release the +1 pcs_powerdown_suspend_count that we booted up with. */ resume_cluster_powerdown(); } bool sched_is_in_sleep(void) { return pcs.pcs_in_kernel_sleep || pcs.pcs_wants_kernel_sleep; } bool sched_is_cpu_init_completed(void) { return pcs.pcs_init_completed; } processor_reason_t last_sched_update_powered_cores_continue_reason; static void OS_NORETURN sched_update_powered_cores_continue(void *param __unused, wait_result_t wr __unused) { sched_cond_ack(&sched_update_powered_cores_wakeup); while (true) { lck_mtx_lock(&cluster_powerdown_lock); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); bool needs_update = sched_needs_update_requested_powered_cores(); if (needs_update) { /* This thread shouldn't need to make changes while powerdown is suspended */ assert(pcs.pcs_powerdown_suspend_count == 0); processor_reason_t reason = last_sched_update_powered_cores_continue_reason; sched_update_powered_cores_drops_lock(reason, s); } simple_unlock(&sched_available_cores_lock); splx(s); lck_mtx_unlock(&cluster_powerdown_lock); /* If we did an update, we dropped the lock, so check again. */ if (!needs_update) { sched_cond_wait(&sched_update_powered_cores_wakeup, THREAD_UNINT, sched_update_powered_cores_continue); /* The condition was signaled since we last blocked, check again. */ } } } __options_decl(sched_powered_cores_flags_t, uint32_t, { ASSERT_IN_SLEEP = 0x10000000, ASSERT_POWERDOWN_SUSPENDED = 0x20000000, POWERED_CORES_OPTIONS_MASK = ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED, }); /* * This is KPI with CLPC. 
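 * CLPC passes a bitmask of the cores it wants powered, tagged with either
 * REASON_CLPC_SYSTEM or REASON_CLPC_USER. The request is recorded under the
 * available-cores lock; the actual power transition is performed
 * asynchronously by the sched_update_powered_cores thread, which is
 * signalled at the end only if the effective state needs to change.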
*/ void sched_perfcontrol_update_powered_cores( uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags) { assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)); #if DEVELOPMENT || DEBUG if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) { if (flags & ASSERT_POWERDOWN_SUSPENDED) { assert(pcs.pcs_powerdown_suspend_count > 0); } if (flags & ASSERT_IN_SLEEP) { assert(pcs.pcs_sleep_override_recommended == true); } return; } #endif spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); cpumap_t requested_cores = requested_powered_cores & pcs.pcs_managed_cores; if (reason == REASON_CLPC_SYSTEM) { pcs.pcs_requested_online_clpc_system = requested_cores; } else if (reason == REASON_CLPC_USER) { pcs.pcs_requested_online_clpc_user = requested_cores; } bool needs_update = sched_needs_update_requested_powered_cores(); if (needs_update) { last_sched_update_powered_cores_continue_reason = reason; } simple_unlock(&sched_available_cores_lock); splx(s); if (needs_update) { sched_cond_signal(&sched_update_powered_cores_wakeup, sched_update_powered_cores_thread); } } /* * This doesn't just suspend cluster powerdown. * It also powers up all the cores and leaves them up, * even if some user wanted them down. * This is important because dtrace, monotonic, and others can't handle any * powered down cores, not just cluster powerdown. */ static void suspend_cluster_powerdown_locked(bool for_sleep) { lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); kprintf("%s>calling sched_update_powered_cores to suspend powerdown\n", __func__); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(pcs.pcs_powerdown_suspend_count >= 0); if (for_sleep) { assert(!pcs.pcs_wants_kernel_sleep); assert(!pcs.pcs_in_kernel_sleep); pcs.pcs_wants_kernel_sleep = true; } pcs.pcs_powerdown_suspend_count++; if (sched_needs_update_requested_powered_cores()) { sched_update_powered_cores_drops_lock(REASON_SYSTEM, s); } if (for_sleep) { assert(pcs.pcs_wants_kernel_sleep); assert(!pcs.pcs_in_kernel_sleep); pcs.pcs_in_kernel_sleep = true; assert(sched_needs_update_requested_powered_cores() == false); } simple_unlock(&sched_available_cores_lock); splx(s); if (pcs.pcs_init_completed) { /* At this point, no cpu should be still starting. Let's enforce that. */ check_all_cpus_are_done_starting(for_sleep ? PROCESSOR_BEFORE_ENTERING_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_SUSPEND); } } static void resume_cluster_powerdown_locked(bool for_sleep) { lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); if (pcs.pcs_init_completed) { /* At this point, no cpu should be still starting. Let's enforce that. */ check_all_cpus_are_done_starting(for_sleep ? 
PROCESSOR_WAKE_FROM_SLEEP : PROCESSOR_CLUSTER_POWERDOWN_RESUME); } kprintf("%s>calling sched_update_powered_cores to resume powerdown\n", __func__); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (pcs.pcs_powerdown_suspend_count <= 0) { panic("resume_cluster_powerdown() called with pcs.pcs_powerdown_suspend_count=%d\n", pcs.pcs_powerdown_suspend_count); } if (for_sleep) { assert(pcs.pcs_wants_kernel_sleep); assert(pcs.pcs_in_kernel_sleep); pcs.pcs_wants_kernel_sleep = false; } pcs.pcs_powerdown_suspend_count--; if (pcs.pcs_powerdown_suspend_count == 0) { /* Returning to client controlled powerdown mode */ assert(pcs.pcs_init_completed); /* To match previous behavior, clear the user state */ pcs.pcs_requested_online_user = pcs.pcs_managed_cores; pcs.pcs_user_online_core_control = false; /* To match previous behavior, clear the requested CLPC state. */ pcs.pcs_requested_online_clpc_user = pcs.pcs_managed_cores; pcs.pcs_requested_online_clpc_system = pcs.pcs_managed_cores; } if (sched_needs_update_requested_powered_cores()) { sched_update_powered_cores_drops_lock(REASON_SYSTEM, s); } if (for_sleep) { assert(!pcs.pcs_wants_kernel_sleep); assert(pcs.pcs_in_kernel_sleep); pcs.pcs_in_kernel_sleep = false; assert(sched_needs_update_requested_powered_cores() == false); } simple_unlock(&sched_available_cores_lock); splx(s); } static uint64_t die_and_cluster_to_cpu_mask( __unused unsigned int die_id, __unused unsigned int die_cluster_id) { #if __arm__ || __arm64__ const ml_topology_info_t* topology = ml_get_topology_info(); unsigned int num_clusters = topology->num_clusters; for (unsigned int i = 0; i < num_clusters; i++) { ml_topology_cluster_t* cluster = &topology->clusters[i]; if ((cluster->die_id == die_id) && (cluster->die_cluster_id == die_cluster_id)) { return cluster->cpu_mask; } } #endif return 0ull; } /* * Take an assertion that ensures all CPUs in the cluster are powered up until * the assertion is released. * A system suspend will still power down the CPUs. * This call will stall if system suspend is in progress. * * Future ER: Could this just power up the cluster, and leave enabling the * processors to be asynchronous, or deferred? * * Enabling the rail is synchronous, it must be powered up before returning. */ void sched_enable_acc_rail(unsigned int die_id, unsigned int die_cluster_id) { uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id); lck_mtx_lock(&cluster_powerdown_lock); /* * Note: if pcs.pcs_init_completed is false, because the * CPUs have not booted yet, then we assume that all * clusters are already powered up at boot (see IOCPUInitialize) * so we don't have to wait for cpu boot to complete. * We'll still save the requested assertion and enforce it after * boot completes. */ spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (pcs.pcs_init_completed) { assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask); } /* Can't enable something that is already enabled */ assert((pcs.pcs_required_online_pmgr & core_mask) == 0); pcs.pcs_required_online_pmgr |= core_mask; if (sched_needs_update_requested_powered_cores()) { sched_update_powered_cores_drops_lock(REASON_PMGR_SYSTEM, s); } simple_unlock(&sched_available_cores_lock); splx(s); lck_mtx_unlock(&cluster_powerdown_lock); } /* * Release the assertion ensuring the cluster is powered up. * This operation is asynchronous, so PMGR doesn't need to wait until it takes * effect. 
If the enable comes in before it takes effect, it'll either * wait on the lock, or the async thread will discover it needs no update. */ void sched_disable_acc_rail(unsigned int die_id, unsigned int die_cluster_id) { uint64_t core_mask = die_and_cluster_to_cpu_mask(die_id, die_cluster_id); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); /* Can't disable something that is already disabled */ assert((pcs.pcs_required_online_pmgr & core_mask) == core_mask); if (pcs.pcs_init_completed) { assert3u(pcs.pcs_managed_cores & core_mask, ==, core_mask); } pcs.pcs_required_online_pmgr &= ~core_mask; bool needs_update = sched_needs_update_requested_powered_cores(); if (needs_update) { last_sched_update_powered_cores_continue_reason = REASON_PMGR_SYSTEM; } simple_unlock(&sched_available_cores_lock); splx(s); if (needs_update) { sched_cond_signal(&sched_update_powered_cores_wakeup, sched_update_powered_cores_thread); } } void suspend_cluster_powerdown(void) { lck_mtx_lock(&cluster_powerdown_lock); suspend_cluster_powerdown_locked(false); lck_mtx_unlock(&cluster_powerdown_lock); } void resume_cluster_powerdown(void) { lck_mtx_lock(&cluster_powerdown_lock); resume_cluster_powerdown_locked(false); lck_mtx_unlock(&cluster_powerdown_lock); if (sched_enable_smt == 0) { enable_smt_processors(false); } } LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp); static bool user_suspended_cluster_powerdown = false; kern_return_t suspend_cluster_powerdown_from_user(void) { kern_return_t ret = KERN_FAILURE; lck_mtx_lock(&user_cluster_powerdown_lock); if (!user_suspended_cluster_powerdown) { suspend_cluster_powerdown(); user_suspended_cluster_powerdown = true; ret = KERN_SUCCESS; } lck_mtx_unlock(&user_cluster_powerdown_lock); return ret; } kern_return_t resume_cluster_powerdown_from_user(void) { kern_return_t ret = KERN_FAILURE; lck_mtx_lock(&user_cluster_powerdown_lock); if (user_suspended_cluster_powerdown) { resume_cluster_powerdown(); user_suspended_cluster_powerdown = false; ret = KERN_SUCCESS; } lck_mtx_unlock(&user_cluster_powerdown_lock); return ret; } int get_cluster_powerdown_user_suspended(void) { lck_mtx_lock(&user_cluster_powerdown_lock); int ret = (int)user_suspended_cluster_powerdown; lck_mtx_unlock(&user_cluster_powerdown_lock); return ret; } #if DEVELOPMENT || DEBUG /* Functions to support the temporary sysctl */ static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED; void sched_set_powered_cores(int requested_powered_cores) { processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM; sched_powered_cores_flags_t flags = requested_powered_cores & POWERED_CORES_OPTIONS_MASK; saved_requested_powered_cores = requested_powered_cores; requested_powered_cores = bits(requested_powered_cores, 28, 0); sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags); } int sched_get_powered_cores(void) { return (int)saved_requested_powered_cores; } uint64_t sched_sysctl_get_recommended_cores(void) { return pcs.pcs_recommended_cores; } #endif /* * Ensure that all cores are powered and recommended before sleep * Acquires cluster_powerdown_lock and returns with it held. 
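 * The recommendation override (pcs_sleep_override_recommended) is published
 * first, then cluster powerdown is suspended with the for_sleep flag so that
 * pcs_wants_kernel_sleep / pcs_in_kernel_sleep are tracked as well.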
*/ void sched_override_available_cores_for_sleep(void) { if (!pcs.pcs_init_completed) { panic("Attempting to sleep before all CPUS are registered"); } lck_mtx_lock(&cluster_powerdown_lock); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(pcs.pcs_sleep_override_recommended == false); pcs.pcs_sleep_override_recommended = true; sched_update_recommended_cores_locked(REASON_SYSTEM, 0); simple_unlock(&sched_available_cores_lock); splx(s); suspend_cluster_powerdown_locked(true); } /* * Restore the previously recommended cores, but leave all cores powered * after sleep. * Called with cluster_powerdown_lock still held, releases the lock. */ void sched_restore_available_cores_after_sleep(void) { lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(pcs.pcs_sleep_override_recommended == true); pcs.pcs_sleep_override_recommended = false; sched_update_recommended_cores_locked(REASON_NONE, 0); simple_unlock(&sched_available_cores_lock); splx(s); resume_cluster_powerdown_locked(true); lck_mtx_unlock(&cluster_powerdown_lock); if (sched_enable_smt == 0) { enable_smt_processors(false); } } #if __arm__ || __arm64__ uint64_t perfcontrol_failsafe_maintenance_runnable_time; uint64_t perfcontrol_failsafe_activation_time; uint64_t perfcontrol_failsafe_deactivation_time; /* data covering who likely caused it and how long they ran */ #define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */ char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN]; int perfcontrol_failsafe_pid; uint64_t perfcontrol_failsafe_tid; uint64_t perfcontrol_failsafe_thread_timer_at_start; uint64_t perfcontrol_failsafe_thread_timer_last_seen; uint64_t perfcontrol_failsafe_recommended_at_trigger; /* * Perf controller calls here to update the recommended core bitmask. * If the failsafe is active, we don't immediately apply the new value. * Instead, we store the new request and use it after the failsafe deactivates. * * If the failsafe is not active, immediately apply the update. 
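 * The applied mask is the intersection of the CLPC "system" and "user"
 * requests, whichever of the two a given call updates.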
* * No scheduler locks are held, no other locks are held that scheduler might depend on, * interrupts are enabled * * currently prototype is in osfmk/arm/machine_routines.h */ void sched_perfcontrol_update_recommended_cores_reason( uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags) { assert(preemption_enabled()); spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (reason == REASON_CLPC_SYSTEM) { pcs.pcs_requested_recommended_clpc_system = recommended_cores; } else { assert(reason == REASON_CLPC_USER); pcs.pcs_requested_recommended_clpc_user = recommended_cores; } pcs.pcs_requested_recommended_clpc = pcs.pcs_requested_recommended_clpc_system & pcs.pcs_requested_recommended_clpc_user; sysctl_sched_recommended_cores = pcs.pcs_requested_recommended_clpc; sched_update_recommended_cores_locked(reason, 0); simple_unlock(&sched_available_cores_lock); splx(s); } void sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores) { sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0); } /* * Consider whether we need to activate the recommended cores failsafe * * Called from quantum timer interrupt context of a realtime thread * No scheduler locks are held, interrupts are disabled */ void sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread) { /* * Check if a realtime thread is starving the system * and bringing up non-recommended cores would help * * TODO: Is this the correct check for recommended == possible cores? * TODO: Validate the checks without the relevant lock are OK. */ if (__improbable(pcs.pcs_recommended_clpc_failsafe_active)) { /* keep track of how long the responsible thread runs */ uint64_t cur_th_time = recount_current_thread_time_mach(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (pcs.pcs_recommended_clpc_failsafe_active && cur_thread->thread_id == perfcontrol_failsafe_tid) { perfcontrol_failsafe_thread_timer_last_seen = cur_th_time; } simple_unlock(&sched_available_cores_lock); /* we're already trying to solve the problem, so bail */ return; } /* The failsafe won't help if there are no more processors to enable */ if (__probable(bit_count(pcs.pcs_requested_recommended_clpc) >= processor_count)) { return; } uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold; /* Use the maintenance thread as our canary in the coal mine */ thread_t m_thread = sched_maintenance_thread; /* If it doesn't look bad, nothing to see here */ if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) { return; } /* It looks bad, take the lock to be sure */ thread_lock(m_thread); if (thread_get_runq(m_thread) == PROCESSOR_NULL || (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN || m_thread->last_made_runnable_time >= too_long_ago) { /* * Maintenance thread is either on cpu or blocked, and * therefore wouldn't benefit from more cores */ thread_unlock(m_thread); return; } uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time; thread_unlock(m_thread); /* * There are cores disabled at perfcontrol's recommendation, but the * system is so overloaded that the maintenance thread can't run. * That likely means that perfcontrol can't run either, so it can't fix * the recommendation. We have to kick in a failsafe to keep from starving. * * When the maintenance thread has been starved for too long, * ignore the recommendation from perfcontrol and light up all the cores. 
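 * While the failsafe is active, the thread that was on core at activation is
 * recorded (pid, name, tid, and its recent on-core time) so that the
 * maintenance thread can report the likely culprit when it later tears the
 * failsafe down.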
* * TODO: Consider weird states like boot, sleep, or debugger */ simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); if (pcs.pcs_recommended_clpc_failsafe_active) { simple_unlock(&sched_available_cores_lock); return; } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START, pcs.pcs_requested_recommended_clpc, maintenance_runnable_time, 0, 0, 0); pcs.pcs_recommended_clpc_failsafe_active = true; perfcontrol_failsafe_activation_time = mach_absolute_time(); perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time; perfcontrol_failsafe_recommended_at_trigger = pcs.pcs_requested_recommended_clpc; /* Capture some data about who screwed up (assuming that the thread on core is at fault) */ task_t task = get_threadtask(cur_thread); perfcontrol_failsafe_pid = task_pid(task); strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name)); perfcontrol_failsafe_tid = cur_thread->thread_id; /* Blame the thread for time it has run recently */ uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered; uint64_t last_seen = recount_current_thread_time_mach(); /* Compute the start time of the bad behavior in terms of the thread's on core time */ perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation; perfcontrol_failsafe_thread_timer_last_seen = last_seen; /* Publish the pcs_recommended_clpc_failsafe_active override to the CPUs */ sched_update_recommended_cores_locked(REASON_SYSTEM, 0); simple_unlock(&sched_available_cores_lock); } /* * Now that our bacon has been saved by the failsafe, consider whether to turn it off * * Runs in the context of the maintenance thread, no locks held */ static void sched_recommended_cores_maintenance(void) { /* Common case - no failsafe, nothing to be done here */ if (__probable(!pcs.pcs_recommended_clpc_failsafe_active)) { return; } uint64_t ctime = mach_absolute_time(); boolean_t print_diagnostic = FALSE; char p_name[FAILSAFE_NAME_LEN] = ""; spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); /* Check again, under the lock, to avoid races */ if (!pcs.pcs_recommended_clpc_failsafe_active) { goto out; } /* * Ensure that the other cores get another few ticks to run some threads * If we don't have this hysteresis, the maintenance thread is the first * to run, and then it immediately kills the other cores */ if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) { goto out; } /* Capture some diagnostic state under the lock so we can print it out later */ int pid = perfcontrol_failsafe_pid; uint64_t tid = perfcontrol_failsafe_tid; uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen - perfcontrol_failsafe_thread_timer_at_start; uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger; uint64_t rec_cores_after = pcs.pcs_requested_recommended_clpc; uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time; strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name)); print_diagnostic = TRUE; /* Deactivate the failsafe and reinstate the requested recommendation settings */ perfcontrol_failsafe_deactivation_time = ctime; pcs.pcs_recommended_clpc_failsafe_active = false; sched_update_recommended_cores_locked(REASON_SYSTEM, 0); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END, pcs.pcs_requested_recommended_clpc, 
failsafe_duration, 0, 0, 0); out: simple_unlock(&sched_available_cores_lock); splx(s); if (print_diagnostic) { uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0; absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms); failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC; absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms); thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC; printf("recommended core failsafe kicked in for %lld ms " "likely due to %s[%d] thread 0x%llx spending " "%lld ms on cpu at realtime priority - " "new recommendation: 0x%llx -> 0x%llx\n", failsafe_duration_ms, p_name, pid, tid, thread_usage_ms, rec_cores_before, rec_cores_after); } } #endif /* __arm64__ */ /* * This is true before we have jumped to kernel_bootstrap_thread * first thread context during boot, or while all processors * have offlined during system sleep and the scheduler is disabled. * * (Note: only ever true on ARM, Intel doesn't actually offline the last CPU) */ bool sched_all_cpus_offline(void) { return pcs.pcs_effective.pcs_online_cores == 0; } void sched_assert_not_last_online_cpu(__assert_only int cpu_id) { assertf(pcs.pcs_effective.pcs_online_cores != BIT(cpu_id), "attempting to shut down the last online CPU!"); } /* * This is the unified single function to change published active core counts based on processor mode. * Each type of flag affects the other in terms of how the counts change. * * Future: Add support for not decrementing counts in 'temporary derecommended online' mode * Future: Shutdown for system sleep should be 'temporary' according to the user counts * so that no client sees a transiently low number of CPUs. */ void sched_processor_change_mode_locked(processor_t processor, processor_mode_t pcm_mode, bool set) { simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED); pset_assert_locked(processor->processor_set); switch (pcm_mode) { case PCM_RECOMMENDED: if (set) { assert(!processor->is_recommended); assert(!bit_test(pcs.pcs_recommended_cores, processor->cpu_id)); processor->is_recommended = true; bit_set(pcs.pcs_recommended_cores, processor->cpu_id); if (processor->processor_online) { os_atomic_inc(&processor_avail_count_user, relaxed); if (processor->processor_primary == processor) { os_atomic_inc(&primary_processor_avail_count_user, relaxed); } } } else { assert(processor->is_recommended); assert(bit_test(pcs.pcs_recommended_cores, processor->cpu_id)); processor->is_recommended = false; bit_clear(pcs.pcs_recommended_cores, processor->cpu_id); if (processor->processor_online) { os_atomic_dec(&processor_avail_count_user, relaxed); if (processor->processor_primary == processor) { os_atomic_dec(&primary_processor_avail_count_user, relaxed); } } } break; case PCM_TEMPORARY: if (set) { assert(!processor->shutdown_temporary); assert(!bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id)); processor->shutdown_temporary = true; bit_set(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id); if (!processor->processor_online) { goto counts_up; } } else { assert(processor->shutdown_temporary); assert(bit_test(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id)); processor->shutdown_temporary = false; bit_clear(pcs.pcs_effective.pcs_tempdown_cores, processor->cpu_id); if (!processor->processor_online) { goto counts_down; } } break; case PCM_ONLINE: if (set) { assert(!processor->processor_online); assert(!bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id)); processor->processor_online = true; 
bit_set(pcs.pcs_effective.pcs_online_cores, processor->cpu_id); if (!processor->shutdown_temporary) { goto counts_up; } } else { assert(processor->processor_online); assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id)); processor->processor_online = false; bit_clear(pcs.pcs_effective.pcs_online_cores, processor->cpu_id); if (!processor->shutdown_temporary) { goto counts_down; } } break; default: panic("unknown mode %d", pcm_mode); } return; counts_up: ml_cpu_up_update_counts(processor->cpu_id); os_atomic_inc(&processor_avail_count, relaxed); if (processor->is_recommended) { os_atomic_inc(&processor_avail_count_user, relaxed); } if (processor->processor_primary == processor) { os_atomic_inc(&primary_processor_avail_count, relaxed); if (processor->is_recommended) { os_atomic_inc(&primary_processor_avail_count_user, relaxed); } } commpage_update_active_cpus(); return; counts_down: ml_cpu_down_update_counts(processor->cpu_id); os_atomic_dec(&processor_avail_count, relaxed); if (processor->is_recommended) { os_atomic_dec(&processor_avail_count_user, relaxed); } if (processor->processor_primary == processor) { os_atomic_dec(&primary_processor_avail_count, relaxed); if (processor->is_recommended) { os_atomic_dec(&primary_processor_avail_count_user, relaxed); } } commpage_update_active_cpus(); return; } bool sched_mark_processor_online(processor_t processor, __assert_only processor_reason_t reason) { assert(processor == current_processor()); processor_set_t pset = processor->processor_set; spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); pset_lock(pset); /* Boot CPU coming online for the first time, either at boot or after sleep */ bool is_first_online_processor = sched_all_cpus_offline(); if (is_first_online_processor) { assert(processor == master_processor); } assert((processor != master_processor) || (reason == REASON_SYSTEM) || support_bootcpu_shutdown); sched_processor_change_mode_locked(processor, PCM_ONLINE, true); assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTING || processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_RUNNING || processor->processor_offline_state == PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP); processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_STARTED_NOT_WAITED); ++pset->online_processor_count; pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); if (processor->is_recommended) { SCHED(pset_made_schedulable)(processor, pset, false); /* May relock the pset lock */ } pset_unlock(pset); smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE); simple_unlock(&sched_available_cores_lock); splx(s); return is_first_online_processor; } void sched_mark_processor_offline(processor_t processor, bool is_final_system_sleep) { assert(processor == current_processor()); processor_set_t pset = processor->processor_set; spl_t s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(bit_test(pcs.pcs_effective.pcs_online_cores, processor->cpu_id)); assert(processor->processor_offline_state == PROCESSOR_OFFLINE_BEGIN_SHUTDOWN); if (!is_final_system_sleep) { /* * We can't shut down the last available core! * Force recommend another CPU if this is the last one. */ if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) { sched_update_recommended_cores_locked(REASON_SYSTEM, BIT(processor->cpu_id)); } /* If we're still the last one, something went wrong. 
*/ if ((pcs.pcs_effective.pcs_online_cores & pcs.pcs_recommended_cores) == BIT(processor->cpu_id)) { panic("shutting down the last available core! online: 0x%llx rec: 0x%llx", pcs.pcs_effective.pcs_online_cores, pcs.pcs_recommended_cores); } } pset_lock(pset); assert(processor->state == PROCESSOR_RUNNING); assert(processor->processor_inshutdown); pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE); --pset->online_processor_count; sched_processor_change_mode_locked(processor, PCM_ONLINE, false); if (is_final_system_sleep) { assert3u(pcs.pcs_effective.pcs_online_cores, ==, 0); assert(processor == master_processor); assert(sched_all_cpus_offline()); processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_FINAL_SYSTEM_SLEEP); } else { processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_PENDING_OFFLINE); } simple_unlock(&sched_available_cores_lock); SCHED(processor_queue_shutdown)(processor); /* pset lock dropped */ SCHED(rt_queue_shutdown)(processor); splx(s); } /* * Apply a new recommended cores mask to the processors it affects * Runs after considering failsafes and such * * Iterate over processors and update their ->is_recommended field. * If a processor is running, we let it drain out at its next * quantum expiration or blocking point. If a processor is idle, there * may be more work for it to do, so IPI it. * * interrupts disabled, sched_available_cores_lock is held * * If a core is about to go offline, its bit will be set in core_going_offline, * so we can make sure not to pick it as the last resort cpu. */ static void sched_update_recommended_cores_locked(processor_reason_t reason, cpumap_t core_going_offline) { simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED); cpumap_t recommended_cores = pcs.pcs_requested_recommended_clpc; if (pcs.pcs_init_completed) { recommended_cores &= pcs.pcs_effective.pcs_powerdown_recommended_cores; } if (pcs.pcs_sleep_override_recommended || pcs.pcs_recommended_clpc_failsafe_active) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE, recommended_cores, sched_maintenance_thread->last_made_runnable_time, 0, 0, 0); recommended_cores = pcs.pcs_managed_cores; } if (bit_count(recommended_cores & pcs.pcs_effective.pcs_online_cores & ~core_going_offline) == 0) { /* * If there are no online cpus recommended, * then the system will make no forward progress. * Pick a CPU of last resort to avoid hanging.
*/ int last_resort; if (!support_bootcpu_shutdown) { /* We know the master_processor is always available */ last_resort = master_processor->cpu_id; } else { /* Pick some still-online processor to be the processor of last resort */ last_resort = lsb_first(pcs.pcs_effective.pcs_online_cores & ~core_going_offline); if (last_resort == -1) { panic("%s> no last resort cpu found: 0x%llx 0x%llx", __func__, pcs.pcs_effective.pcs_online_cores, core_going_offline); } } bit_set(recommended_cores, last_resort); } if (pcs.pcs_recommended_cores == recommended_cores) { /* Nothing to do */ return; } KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START, recommended_cores, pcs.pcs_recommended_clpc_failsafe_active, pcs.pcs_sleep_override_recommended, 0); cpumap_t needs_exit_idle_mask = 0x0; /* First set recommended cores */ foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask; cpumap_t newly_recommended = changed_recommendations & recommended_cores; if (newly_recommended == 0) { /* Nothing to do */ continue; } pset_lock(pset); cpumap_foreach(cpu_id, newly_recommended) { processor_t processor = processor_array[cpu_id]; sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, true); processor->last_recommend_reason = reason; if (pset->recommended_bitmask == 0) { /* Cluster is becoming available for scheduling */ atomic_bit_set(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed); } bit_set(pset->recommended_bitmask, processor->cpu_id); if (processor->state == PROCESSOR_IDLE) { if (processor != current_processor()) { bit_set(needs_exit_idle_mask, processor->cpu_id); } } if (processor->processor_online) { SCHED(pset_made_schedulable)(processor, pset, false); /* May relock the pset lock */ } } pset_update_rt_stealable_state(pset); pset_unlock(pset); cpumap_foreach(cpu_id, newly_recommended) { smr_cpu_up(processor_array[cpu_id], SMR_CPU_REASON_IGNORED); } } } /* Now shutdown not recommended cores */ foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask; cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores; if (newly_unrecommended == 0) { /* Nothing to do */ continue; } cpumap_foreach(cpu_id, newly_unrecommended) { processor_t processor = processor_array[cpu_id]; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; pset_lock(pset); sched_processor_change_mode_locked(processor, PCM_RECOMMENDED, false); if (reason != REASON_NONE) { processor->last_derecommend_reason = reason; } bit_clear(pset->recommended_bitmask, processor->cpu_id); pset_update_rt_stealable_state(pset); if (pset->recommended_bitmask == 0) { /* Cluster is becoming unavailable for scheduling */ atomic_bit_clear(&pset->node->pset_recommended_map, pset->pset_id, memory_order_relaxed); } if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) { ipi_type = SCHED_IPI_IMMEDIATE; } SCHED(processor_queue_shutdown)(processor); /* pset unlocked */ SCHED(rt_queue_shutdown)(processor); if (ipi_type == SCHED_IPI_NONE) { /* * If the core is idle, * we can directly mark the processor * as "Ignored" * * Otherwise, smr will detect this * during smr_cpu_leave() when the * processor actually idles. 
*/ smr_cpu_down(processor, SMR_CPU_REASON_IGNORED); } else if (processor == current_processor()) { ast_on(AST_PREEMPT); } else { sched_ipi_perform(processor, ipi_type); } } } } if (pcs.pcs_init_completed) { assert3u(pcs.pcs_recommended_cores, ==, recommended_cores); } #if defined(__x86_64__) commpage_update_active_cpus(); #endif /* Issue all pending IPIs now that the pset lock has been dropped */ cpumap_foreach(cpu_id, needs_exit_idle_mask) { processor_t processor = processor_array[cpu_id]; machine_signal_idle(processor); } KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END, needs_exit_idle_mask, 0, 0, 0); } /* * Enters with the available cores lock held, returns with it held, but will drop it in the meantime. * Enters with the cluster_powerdown_lock held, returns with it held, keeps it held. */ static void sched_update_powered_cores_drops_lock(processor_reason_t requested_reason, spl_t caller_s) { lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED); simple_lock_assert(&sched_available_cores_lock, LCK_ASSERT_OWNED); assert(ml_get_interrupts_enabled() == false); assert(caller_s == true); /* Caller must have had interrupts enabled when they took the lock */ /* All transitions should be quiesced before we start changing things */ assert_no_processors_in_transition_locked(); pcs.pcs_in_flight_reason = requested_reason; struct powered_cores_state requested = sched_compute_requested_powered_cores(); struct powered_cores_state effective = pcs.pcs_effective; KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START, requested.pcs_online_cores, requested_reason, 0, effective.pcs_online_cores); /* The bits that are different and in the new value */ cpumap_t newly_online_cores = (requested.pcs_online_cores ^ effective.pcs_online_cores) & requested.pcs_online_cores; /* The bits that are different and are not in the new value */ cpumap_t newly_offline_cores = (requested.pcs_online_cores ^ effective.pcs_online_cores) & ~requested.pcs_online_cores; cpumap_t newly_recommended_cores = (requested.pcs_powerdown_recommended_cores ^ effective.pcs_powerdown_recommended_cores) & requested.pcs_powerdown_recommended_cores; cpumap_t newly_derecommended_cores = (requested.pcs_powerdown_recommended_cores ^ effective.pcs_powerdown_recommended_cores) & ~requested.pcs_powerdown_recommended_cores; cpumap_t newly_temporary_cores = (requested.pcs_tempdown_cores ^ effective.pcs_tempdown_cores) & requested.pcs_tempdown_cores; cpumap_t newly_nontemporary_cores = (requested.pcs_tempdown_cores ^ effective.pcs_tempdown_cores) & ~requested.pcs_tempdown_cores; /* * Newly online and derecommended cores should be derecommended * before powering them up, so they never run around doing stuff * before we reach the end of this function. */ cpumap_t newly_online_and_derecommended = newly_online_cores & newly_derecommended_cores; /* * Publish the goal state we're working on achieving. * At the end of this function, pcs_effective will match this. 
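 *
 * As a small worked example of the mask arithmetic above (made-up 4-cpu
 * masks, low bit = cpu 0): if the effective online mask is 0b0111 and the
 * requested online mask is 0b1101, then
 *   newly_online_cores  = (0b0111 ^ 0b1101) &  0b1101 = 0b1000  (cpu 3 powers up)
 *   newly_offline_cores = (0b0111 ^ 0b1101) & ~0b1101 = 0b0010  (cpu 1 powers down)
 * The same pattern yields the newly (de)recommended and (non)temporary sets.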
*/ pcs.pcs_requested = requested; pcs.pcs_effective.pcs_powerdown_recommended_cores |= newly_recommended_cores; pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_online_and_derecommended; sched_update_recommended_cores_locked(requested_reason, 0); simple_unlock(&sched_available_cores_lock); splx(caller_s); assert(ml_get_interrupts_enabled() == true); /* First set powered cores */ cpumap_t started_cores = 0ull; foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; spl_t s = splsched(); pset_lock(pset); cpumap_t pset_newly_online = newly_online_cores & pset->cpu_bitmask; __assert_only cpumap_t pset_online_cores = pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]; assert((pset_online_cores & pset_newly_online) == 0); pset_unlock(pset); splx(s); if (pset_newly_online == 0) { /* Nothing to do */ continue; } cpumap_foreach(cpu_id, pset_newly_online) { processor_start_reason(processor_array[cpu_id], requested_reason); bit_set(started_cores, cpu_id); } } } /* * Wait for processors to finish starting in parallel. * We never proceed until all newly started processors have finished. * * This has the side effect of closing the ml_cpu_up_processors race, * as all started CPUs must have SIGPdisabled cleared by the time this * is satisfied. (rdar://124631843) */ cpumap_foreach(cpu_id, started_cores) { processor_wait_for_start(processor_array[cpu_id], PROCESSOR_POWERED_CORES_CHANGE); } /* * Update published counts of processors to match new temporary status * Publish all temporary before nontemporary, so that any readers that * see a middle state will see a slightly too high count instead of * ending up seeing a 0 (because that crashes dispatch_apply, ask * me how I know) */ spl_t s; s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; pset_lock(pset); cpumap_t pset_newly_temporary = newly_temporary_cores & pset->cpu_bitmask; cpumap_foreach(cpu_id, pset_newly_temporary) { sched_processor_change_mode_locked(processor_array[cpu_id], PCM_TEMPORARY, true); } pset_unlock(pset); } } foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; pset_lock(pset); cpumap_t pset_newly_nontemporary = newly_nontemporary_cores & pset->cpu_bitmask; cpumap_foreach(cpu_id, pset_newly_nontemporary) { sched_processor_change_mode_locked(processor_array[cpu_id], PCM_TEMPORARY, false); } pset_unlock(pset); } } simple_unlock(&sched_available_cores_lock); splx(s); /* Now shutdown not powered cores */ foreach_node(node) { foreach_pset_id(pset_id, node) { processor_set_t pset = pset_array[pset_id]; s = splsched(); pset_lock(pset); cpumap_t pset_newly_offline = newly_offline_cores & pset->cpu_bitmask; __assert_only cpumap_t pset_powered_cores = pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]; assert((pset_powered_cores & pset_newly_offline) == pset_newly_offline); pset_unlock(pset); splx(s); if (pset_newly_offline == 0) { /* Nothing to do */ continue; } cpumap_foreach(cpu_id, pset_newly_offline) { processor_exit_reason(processor_array[cpu_id], requested_reason, false); } } } assert(ml_get_interrupts_enabled() == true); s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); assert(s == 
caller_s); pcs.pcs_effective.pcs_powerdown_recommended_cores &= ~newly_derecommended_cores; sched_update_recommended_cores_locked(requested_reason, 0); pcs.pcs_previous_reason = requested_reason; /* All transitions should be quiesced now that we are done changing things */ assert_no_processors_in_transition_locked(); assert3u(pcs.pcs_requested.pcs_online_cores, ==, pcs.pcs_effective.pcs_online_cores); assert3u(pcs.pcs_requested.pcs_tempdown_cores, ==, pcs.pcs_effective.pcs_tempdown_cores); assert3u(pcs.pcs_requested.pcs_powerdown_recommended_cores, ==, pcs.pcs_effective.pcs_powerdown_recommended_cores); KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0); } void thread_set_options(uint32_t thopt) { spl_t x; thread_t t = current_thread(); x = splsched(); thread_lock(t); t->options |= thopt; thread_unlock(t); splx(x); } void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) { thread->pending_block_hint = block_hint; } uint32_t qos_max_parallelism(int qos, uint64_t options) { return SCHED(qos_max_parallelism)(qos, options); } uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options) { host_basic_info_data_t hinfo; mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; /* * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which * implement their own qos_max_parallelism() interfaces. */ assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0); /* Query the machine layer for core information */ __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO, (host_info_t)&hinfo, &count); assert(kret == KERN_SUCCESS); if (options & QOS_PARALLELISM_COUNT_LOGICAL) { return hinfo.logical_cpu; } else { return hinfo.physical_cpu; } } int sched_allow_NO_SMT_threads = 1; bool thread_no_smt(thread_t thread) { return sched_allow_NO_SMT_threads && (thread->bound_processor == PROCESSOR_NULL) && ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT)); } bool processor_active_thread_no_smt(processor_t processor) { return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT; } #if __arm64__ /* * Set up or replace old timer with new timer * * Returns true if canceled old timer, false if it did not */ boolean_t sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) { /* * Exchange deadline for new deadline, if old deadline was nonzero, * then I cancelled the callback, otherwise I didn't */ return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline, relaxed) != 0; } /* * Set global SFI window (in usec) */ kern_return_t sched_perfcontrol_sfi_set_window(uint64_t window_usecs) { kern_return_t ret = KERN_NOT_SUPPORTED; #if CONFIG_THREAD_GROUPS if (window_usecs == 0ULL) { ret = sfi_window_cancel(); } else { ret = sfi_set_window(window_usecs); } #endif // CONFIG_THREAD_GROUPS return ret; } /* * Set background and maintenance SFI class offtimes */ kern_return_t sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs) { kern_return_t ret = KERN_NOT_SUPPORTED; #if CONFIG_THREAD_GROUPS if (offtime_usecs == 0ULL) { ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE); ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG); } else { ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs); ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs); } #endif // CONFIG_THREAD_GROUPS return ret; } /* * Set utility SFI class offtime */ kern_return_t 
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs) { kern_return_t ret = KERN_NOT_SUPPORTED; #if CONFIG_THREAD_GROUPS if (offtime_usecs == 0ULL) { ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY); } else { ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs); } #endif // CONFIG_THREAD_GROUPS return ret; } #endif /* __arm64__ */ #if CONFIG_SCHED_EDGE #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u /* * sched_edge_pset_running_higher_bucket() * * Routine to calculate cumulative running counts for each scheduling * bucket. This effectively lets the load calculation calculate if a * cluster is running any threads at a QoS lower than the thread being * migrated etc. */ static void sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher) { bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING]; bzero(running_higher, sizeof(uint32_t) * TH_BUCKET_SCHED_MAX); /* Count the running threads per bucket */ for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) { sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed); /* Don't count idle threads */ if (cpu_bucket < TH_BUCKET_SCHED_MAX) { running_higher[cpu_bucket]++; } } /* Calculate the cumulative running counts as a prefix sum */ for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX - 1; bucket++) { running_higher[bucket + 1] += running_higher[bucket]; } } /* * sched_update_pset_load_average() * * Updates the load average for each sched bucket for a cluster. * This routine must be called with the pset lock held. */ void sched_update_pset_load_average(processor_set_t pset, uint64_t curtime) { int avail_cpu_count = pset_available_cpu_count(pset); if (avail_cpu_count == 0) { /* Looks like the pset is not runnable any more; nothing to do here */ return; } /* * Edge Scheduler Optimization * * See if more callers of this routine can pass in timestamps to avoid the * mach_absolute_time() call here. 
*/ if (!curtime) { curtime = mach_absolute_time(); } uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed); int64_t delta_ticks = curtime - last_update; if (delta_ticks < 0) { return; } uint64_t delta_nsecs = 0; absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs); if (__improbable(delta_nsecs > UINT32_MAX)) { delta_nsecs = UINT32_MAX; } /* Update the shared resource load on the pset */ for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) { uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type); uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]); uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load; uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed); if (old_shared_load != new_shared_load) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load); } } uint32_t running_higher[TH_BUCKET_SCHED_MAX]; sched_edge_pset_running_higher_bucket(pset, running_higher); for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) { uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed); uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS; uint32_t current_runq_depth = sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]; os_atomic_store(&pset->pset_runnable_depth[sched_bucket], current_runq_depth, relaxed); uint32_t current_load = current_runq_depth / avail_cpu_count; /* * For the new load average multiply current_load by delta_nsecs (which results in a 32.0 value). * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the * new load average needs to be shifted before it can be added to the old load average. */ uint64_t new_load_average_factor = (current_load * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS; /* * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA * when the system is already loaded; otherwise for an idle system use the latest load average immediately. 
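 *
 * A rough worked example of the fixed-point EWMA below (made-up numbers,
 * assuming 8 fraction bits): with SCHED_PSET_LOAD_EWMA_TC_NSECS = 10ms,
 * delta_nsecs = 5ms, an old 24.8 load average of 2.0 (0x200), and a
 * current_load of 4:
 *   old_load_average_factor = 0x200 * 10,000,000
 *   new_load_average_factor = (4 * 5,000,000) << 8
 *   load_average = (old + new) / (5,000,000 + 10,000,000) ~= 0x2AA (~2.67)
 * i.e. the average moves a third of the way from 2 toward 4, weighted by the
 * elapsed time relative to the time constant.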
*/ int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS); boolean_t load_uptick = (old_load_shifted == 0) && (current_load != 0); boolean_t load_downtick = (old_load_shifted != 0) && (current_load == 0); uint64_t load_average; if (load_uptick || load_downtick) { load_average = (current_load << SCHED_PSET_LOAD_EWMA_FRACTION_BITS); } else { /* Indicates a loaded system; use EWMA for load average calculation */ load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS); } os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed); if (load_average != old_load_average) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket); } } os_atomic_store(&pset->pset_load_last_update, curtime, relaxed); } void sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket) { pset_execution_time_t old_execution_time_packed, new_execution_time_packed; uint64_t avg_thread_execution_time = 0; os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed, old_execution_time_packed.pset_execution_time_packed, new_execution_time_packed.pset_execution_time_packed, relaxed, { uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update; int64_t delta_ticks = curtime - last_update; if (delta_ticks <= 0) { /* * It's possible that another CPU came in and updated the pset_execution_time * before this CPU could do it. Since the average execution time is meant to * be an approximate measure per cluster, ignore the older update. */ os_atomic_rmw_loop_give_up(return ); } uint64_t delta_nsecs = 0; absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs); uint64_t nanotime = 0; absolutetime_to_nanoseconds(execution_time, &nanotime); uint64_t execution_time_us = nanotime / NSEC_PER_USEC; /* * Since the average execution time is stored in microseconds, avoid rounding errors in * the EWMA calculation by only using a non-zero previous value.
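 *
 * Illustrative example (made-up numbers): with an old average of 1us, a new
 * sample of 100us, and delta_nsecs equal to the 10ms time constant, the
 * update below yields roughly (1 * 10ms + 100 * 10ms) / 20ms ~= 50us.
 * Clamping the previous value to at least 1us helps keep the integer math
 * from pinning the average at zero.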
*/ uint64_t old_avg_thread_execution_time = MAX(old_execution_time_packed.pset_avg_thread_execution_time, 1ULL); uint64_t old_execution_time = (old_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS); uint64_t new_execution_time = (execution_time_us * delta_nsecs); avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS); new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time; new_execution_time_packed.pset_execution_time_last_update = curtime; }); if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket); } } uint64_t sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type) { /* Prevent migrations to derecommended clusters */ if (!pset_is_recommended(pset)) { return UINT64_MAX; } return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed); } #else /* CONFIG_SCHED_EDGE */ void sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime) { int non_rt_load = pset->pset_runq.count; int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); int new_load_average = ((int)pset->load_average + load) >> 1; pset->load_average = new_load_average; #if (DEVELOPMENT || DEBUG) #if __AMP__ if (pset->pset_cluster_type == PSET_AMP_P) { KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset))); } #endif #endif } void sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket) { } #endif /* CONFIG_SCHED_EDGE */ /* pset is locked */ static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor) { int cpuid = processor->cpu_id; #if defined(__x86_64__) if (sched_avoid_cpu0 && (cpuid == 0)) { return false; } #endif cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; return bit_test(fasttrack_map, cpuid); } /* pset is locked */ static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills) { #if defined(__x86_64__) bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0); #else const bool avoid_cpu0 = false; #endif cpumap_t cpu_map; try_again: cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; if (skip_processor) { bit_clear(cpu_map, skip_processor->cpu_id); } if (skip_spills) { cpu_map &= ~pset->rt_pending_spill_cpu_mask; } if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { bit_clear(cpu_map, 0); } cpumap_t primary_map = cpu_map & pset->primary_map; if (avoid_cpu0) { primary_map = bit_ror64(primary_map, 1); } int rotid = lsb_first(primary_map); if (rotid >= 0) { int cpuid = avoid_cpu0 ? 
((rotid + 1) & 63) : rotid; processor_t processor = processor_array[cpuid]; return processor; } if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { goto out; } if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { /* Also avoid cpu1 */ bit_clear(cpu_map, 1); } /* Consider secondary processors whose primary is actually running a realtime thread */ cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1); if (avoid_cpu0) { /* Also avoid cpu1 */ secondary_map = bit_ror64(secondary_map, 2); } rotid = lsb_first(secondary_map); if (rotid >= 0) { int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; processor_t processor = processor_array[cpuid]; return processor; } /* Consider secondary processors */ secondary_map = cpu_map & ~pset->primary_map; if (avoid_cpu0) { /* Also avoid cpu1 */ secondary_map = bit_ror64(secondary_map, 2); } rotid = lsb_first(secondary_map); if (rotid >= 0) { int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; processor_t processor = processor_array[cpuid]; return processor; } /* * I was hoping the compiler would optimize * this away when avoid_cpu0 is const bool false * but it still complains about the assignment * in that case. */ if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { #if defined(__x86_64__) avoid_cpu0 = false; #else assert(0); #endif goto try_again; } out: if (skip_processor) { return PROCESSOR_NULL; } /* * If we didn't find an obvious processor to choose, but there are still more CPUs * not already running realtime threads than realtime threads in the realtime run queue, * this thread belongs in this pset, so choose some other processor in this pset * to ensure the thread is enqueued here. */ cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map; if (bit_count(non_realtime_map) > rt_runq_count(pset)) { cpu_map = non_realtime_map; assert(cpu_map != 0); int cpuid = bit_first(cpu_map); assert(cpuid >= 0); return processor_array[cpuid]; } if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { goto skip_secondaries; } non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map; if (bit_count(non_realtime_map) > rt_runq_count(pset)) { cpu_map = non_realtime_map; assert(cpu_map != 0); int cpuid = bit_first(cpu_map); assert(cpuid >= 0); return processor_array[cpuid]; } skip_secondaries: return PROCESSOR_NULL; } /* * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority. * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline. * * pset is locked.
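 *
 * For example (illustrative values): with max_pri = 98, cpu 0 at
 * current_pri 98 / deadline 1000, cpu 1 at current_pri 98 / deadline 5000,
 * and cpu 2 at current_pri 97 / deadline 2000, cpu 2 is chosen: priority is
 * compared first, and the furthest deadline only breaks ties among the
 * lowest-priority candidates (cpu 1 would win without cpu 2, provided 5000
 * exceeds minimum_deadline plus the epsilon).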
*/ static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus) { uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon); processor_t fd_processor = PROCESSOR_NULL; int lowest_priority = max_pri; cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask; if (skip_processor) { bit_clear(cpu_map, skip_processor->cpu_id); } if (skip_spills) { cpu_map &= ~pset->rt_pending_spill_cpu_mask; } for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { processor_t processor = processor_array[cpuid]; if (processor->current_pri > lowest_priority) { continue; } if (processor->current_pri < lowest_priority) { lowest_priority = processor->current_pri; furthest_deadline = processor->deadline; fd_processor = processor; continue; } if (processor->deadline > furthest_deadline) { furthest_deadline = processor->deadline; fd_processor = processor; } } if (fd_processor) { return fd_processor; } /* * There is a race condition possible when there are multiple processor sets. * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU, * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads, * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling. * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue. * But even if we chose the better run queue, we still wouldn't send an IPI in this case.) * * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads * on the run queue of that pset.
*/ if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) { cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask; assert(skip_processor == PROCESSOR_NULL); assert(skip_spills == false); for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { processor_t processor = processor_array[cpuid]; if (processor->current_pri > lowest_priority) { continue; } if (processor->current_pri < lowest_priority) { lowest_priority = processor->current_pri; furthest_deadline = processor->deadline; fd_processor = processor; continue; } if (processor->deadline > furthest_deadline) { furthest_deadline = processor->deadline; fd_processor = processor; } } } return fd_processor; } /* pset is locked */ static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries) { bool skip_spills = true; bool include_ast_urgent_pending_cpus = false; processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills); if (next_processor != PROCESSOR_NULL) { return next_processor; } next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus); return next_processor; } #if defined(__x86_64__) /* pset is locked */ static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups) { bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0); int nbackup_cpus = 0; if (include_backups && rt_runq_is_low_latency(pset)) { nbackup_cpus = sched_rt_n_backup_processors; } cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map; if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { bit_clear(cpu_map, 0); } return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map); } /* pset is locked */ static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups) { int nbackup_cpus = 0; if (include_backups && rt_runq_is_low_latency(pset)) { nbackup_cpus = sched_rt_n_backup_processors; } cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map; return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map); } #endif static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup) { if (!processor->is_recommended) { return false; } bool ok_to_run_realtime_thread = true; #if defined(__x86_64__) bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id); if (spill_pending) { return true; } if (processor->cpu_id == 0) { if (sched_avoid_cpu0 == 1) { ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup); } else if (sched_avoid_cpu0 == 2) { ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup); } } else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) { ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup); } else if (processor->processor_primary != processor) { ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup)); } #else (void)pset; (void)processor; (void)as_backup; #endif return ok_to_run_realtime_thread; } void 
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock) { if (drop_lock) { pset_unlock(pset); } } void thread_set_no_smt(bool set) { if (!system_is_SMT) { /* Not a machine that supports SMT */ return; } thread_t thread = current_thread(); spl_t s = splsched(); thread_lock(thread); if (set) { thread->sched_flags |= TH_SFLAG_NO_SMT; } thread_unlock(thread); splx(s); } bool thread_get_no_smt(void) { return current_thread()->sched_flags & TH_SFLAG_NO_SMT; } extern void task_set_no_smt(task_t); void task_set_no_smt(task_t task) { if (!system_is_SMT) { /* Not a machine that supports SMT */ return; } if (task == TASK_NULL) { task = current_task(); } task_lock(task); task->t_flags |= TF_NO_SMT; task_unlock(task); } #if DEBUG || DEVELOPMENT extern void sysctl_task_set_no_smt(char no_smt); void sysctl_task_set_no_smt(char no_smt) { if (!system_is_SMT) { /* Not a machine that supports SMT */ return; } task_t task = current_task(); task_lock(task); if (no_smt == '1') { task->t_flags |= TF_NO_SMT; } task_unlock(task); } extern char sysctl_task_get_no_smt(void); char sysctl_task_get_no_smt(void) { task_t task = current_task(); if (task->t_flags & TF_NO_SMT) { return '1'; } return '0'; } #endif /* DEVELOPMENT || DEBUG */ __private_extern__ void thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound) { #if __AMP__ spl_t s = splsched(); thread_lock(thread); thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT); thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE; if (soft_bound) { thread->sched_flags |= TH_SFLAG_BOUND_SOFT; } pset_node_t bind_node = PSET_NODE_NULL; switch (cluster_type) { case 'e': case 'E': if (ecore_node->psets != PROCESSOR_SET_NULL) { bind_node = ecore_node; } break; case 'p': case 'P': if (pcore_node->psets != PROCESSOR_SET_NULL) { bind_node = pcore_node; } break; default: break; } if (bind_node != PSET_NODE_NULL) { thread->th_bound_cluster_id = bind_node->psets->pset_id; } thread_unlock(thread); splx(s); if (thread == current_thread()) { thread_block(THREAD_CONTINUE_NULL); } #else /* __AMP__ */ (void)thread; (void)cluster_type; (void)soft_bound; #endif /* __AMP__ */ } extern uint32_t thread_bound_cluster_id(thread_t thread); uint32_t thread_bound_cluster_id(thread_t thread) { return thread->th_bound_cluster_id; } __private_extern__ kern_return_t thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options) { #if __AMP__ processor_set_t pset = NULL; /* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. 
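 *
 * Hypothetical usage sketch (not taken from a real call site): a hard bind
 * to cluster 1 and a later unbind might look like
 *
 *   kern_return_t kr = thread_bind_cluster_id(current_thread(), 1, 0);
 *   ...
 *   kr = thread_bind_cluster_id(current_thread(),
 *       THREAD_BOUND_CLUSTER_NONE, THREAD_UNBIND);
 *
 * THREAD_BIND_SOFT records the binding as a preference rather than a hard
 * requirement, and THREAD_BIND_ELIGIBLE_ONLY fails with KERN_INVALID_POLICY
 * if the thread is not eligible for the target cluster.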
*/ if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) { /* If the thread was actually not bound to some cluster, nothing to do here */ if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) { return KERN_SUCCESS; } } else { /* Validate the inputs for the bind case */ int max_clusters = ml_get_cluster_count(); if (cluster_id >= max_clusters) { /* Invalid cluster id */ return KERN_INVALID_VALUE; } pset = pset_array[cluster_id]; if (pset == NULL) { /* Cluster has not been initialized yet */ return KERN_INVALID_VALUE; } if (options & THREAD_BIND_ELIGIBLE_ONLY) { if (SCHED(thread_eligible_for_pset(thread, pset)) == false) { /* Thread is not recommended for the cluster type */ return KERN_INVALID_POLICY; } } } spl_t s = splsched(); thread_lock(thread); /* Unbind the thread from its previous bound state */ thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT); thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE; if (options & THREAD_UNBIND) { /* Nothing more to do here */ goto thread_bind_cluster_complete; } if (options & THREAD_BIND_SOFT) { thread->sched_flags |= TH_SFLAG_BOUND_SOFT; } thread->th_bound_cluster_id = cluster_id; thread_bind_cluster_complete: thread_unlock(thread); splx(s); if (thread == current_thread()) { thread_block(THREAD_CONTINUE_NULL); } #else /* __AMP__ */ (void)thread; (void)cluster_id; (void)options; #endif /* __AMP__ */ return KERN_SUCCESS; } #if DEVELOPMENT || DEBUG extern int32_t sysctl_get_bound_cpuid(void); int32_t sysctl_get_bound_cpuid(void) { int32_t cpuid = -1; thread_t self = current_thread(); processor_t processor = self->bound_processor; if (processor == NULL) { cpuid = -1; } else { cpuid = processor->cpu_id; } return cpuid; } extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid); kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid) { processor_t processor = PROCESSOR_NULL; if (cpuid == -1) { goto unbind; } if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) { return KERN_INVALID_VALUE; } processor = processor_array[cpuid]; if (processor == PROCESSOR_NULL) { return KERN_INVALID_VALUE; } #if __AMP__ thread_t thread = current_thread(); if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) { if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) { /* Cannot hard-bind an already hard-cluster-bound thread */ return KERN_NOT_SUPPORTED; } } #endif /* __AMP__ */ unbind: thread_bind(processor); thread_block(THREAD_CONTINUE_NULL); return KERN_SUCCESS; } #if __AMP__ static char pset_cluster_type_name_char(pset_cluster_type_t pset_type) { switch (pset_type) { case PSET_AMP_E: return 'E'; case PSET_AMP_P: return 'P'; default: panic("Unexpected AMP pset cluster type %d", pset_type); } } #endif /* __AMP__ */ extern char sysctl_get_task_cluster_type(void); char sysctl_get_task_cluster_type(void) { #if __AMP__ task_t task = current_task(); processor_set_t pset_hint = task->pset_hint; if (!pset_hint) { return '0'; } return pset_cluster_type_name_char(pset_hint->pset_cluster_type); #else /* !__AMP__ */ return '0'; #endif /* __AMP__ */ } #if __AMP__ extern char sysctl_get_bound_cluster_type(void); char sysctl_get_bound_cluster_type(void) { thread_t self = current_thread(); if (self->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) { return '0'; } pset_cluster_type_t pset_type = pset_array[self->th_bound_cluster_id]->pset_cluster_type; return pset_cluster_type_name_char(pset_type); } static processor_set_t find_pset_of_type(pset_cluster_type_t t) { for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) { if 
(node->pset_cluster_type != t) { continue; } processor_set_t pset = PROCESSOR_SET_NULL; for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) { pset = pset_array[pset_id]; /* Prefer one with recommended processors */ if (pset_is_recommended(pset)) { assert(pset->pset_cluster_type == t); return pset; } } /* Otherwise return whatever was found last */ return pset; } return PROCESSOR_SET_NULL; } #endif /* __AMP__ */ extern void sysctl_task_set_cluster_type(char cluster_type); void sysctl_task_set_cluster_type(char cluster_type) { task_t task = current_task(); processor_set_t pset_hint = PROCESSOR_SET_NULL; #if __AMP__ switch (cluster_type) { case 'e': case 'E': pset_hint = find_pset_of_type(PSET_AMP_E); break; case 'p': case 'P': pset_hint = find_pset_of_type(PSET_AMP_P); break; default: break; } if (pset_hint) { task_lock(task); task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE; task->pset_hint = pset_hint; task_unlock(task); thread_block(THREAD_CONTINUE_NULL); } #else (void)cluster_type; (void)task; (void)pset_hint; #endif } /* * The quantum length used for Fixed and RT sched modes. In general the quantum * can vary - for example for background or QOS. */ extern uint64_t sysctl_get_quantum_us(void); uint64_t sysctl_get_quantum_us(void) { uint32_t quantum; uint64_t quantum_ns; quantum = SCHED(initial_quantum_size)(THREAD_NULL); absolutetime_to_nanoseconds(quantum, &quantum_ns); return quantum_ns / 1000; } #endif /* DEVELOPMENT || DEBUG */