/* * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ /* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ /* */ /* * File: kern/machine.c * Author: Avadis Tevanian, Jr. * Date: 1987 * * Support for machine independent machine abstraction. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if ML_IO_TIMEOUTS_ENABLED #include #endif #include #include #include #if HIBERNATION #include #endif #include #if CONFIG_DTRACE extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t); #endif #if defined(__arm64__) extern void wait_while_mp_kdp_trap(bool check_SIGPdebug); #if CONFIG_SPTM #include #else #include #endif /* CONFIG_SPTM */ #endif /* defined(__arm64__) */ #if defined(__x86_64__) #include #endif /* * Exported variables: */ TUNABLE(long, wdt, "wdt", 0); struct machine_info machine_info; /* Forwards */ static void processor_offline(void * parameter, __unused wait_result_t result); static void processor_offline_intstack(processor_t processor) __dead2; /* * processor_up: * * Flag processor as up and running, and available * for scheduling. */ void processor_up( processor_t processor) { spl_t s = splsched(); init_ast_check(processor); #if defined(__arm64__) /* * A processor coming online won't have received a SIGPdebug signal * to cause it to spin while a stackshot or panic is taking place, * so spin here on mp_kdp_trap. * * However, since cpu_signal() is not yet enabled for this processor, * there is a race if we have just passed this when a cpu_signal() * is attempted. 
The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere. See processor_cpu_reinit() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	/* Boot CPU coming online for the first time, either at boot or after sleep */
	__assert_only bool is_first_online_processor;

	is_first_online_processor = sched_mark_processor_online(processor,
	    processor->last_startup_reason);

	/* The startup flag is checked with processor_start_state_lock held. */
	simple_lock(&processor_start_state_lock, LCK_GRP_NULL);
	assert(processor->processor_instartup == true || is_first_online_processor);
	simple_unlock(&processor_start_state_lock);

	splx(s);

#if defined(__x86_64__)
	ml_cpu_up();
#endif /* defined(__x86_64__) */

#if CONFIG_DTRACE
	/*
	 * Invoke the DTrace CPU state hook, when one is installed, with TRUE
	 * (processor_offline() invokes the same hook with FALSE).
	 */
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}

#include

/*
 *	host_reboot:
 *
 *	Halt or restart the machine on behalf of a privileged host port.
 *
 *	host_priv	must not be HOST_PRIV_NULL, otherwise
 *			KERN_INVALID_HOST is returned.
 *	options		HOST_REBOOT_* flag bits:
 *			- HOST_REBOOT_DEBUGGER (DEVELOPMENT/DEBUG kernels only):
 *			  enter the debugger and return KERN_SUCCESS.
 *			- HOST_REBOOT_UPSDELAY: call
 *			  PEHaltRestart(kPEUPSDelayHaltCPU).
 *			- otherwise: call halt_all_cpus(), passing true unless
 *			  HOST_REBOOT_HALT is set.
 *
 *	Returns KERN_SUCCESS if any of those calls returns.
 */
kern_return_t
host_reboot(
	host_priv_t             host_priv,
	int                             options)
{
	if (host_priv == HOST_PRIV_NULL) {
		return KERN_INVALID_HOST;
	}

#if DEVELOPMENT || DEBUG
	if (options & HOST_REBOOT_DEBUGGER) {
		Debugger("Debugger");
		return KERN_SUCCESS;
	}
#endif

	if (options & HOST_REBOOT_UPSDELAY) {
		// UPS power cutoff path
		PEHaltRestart( kPEUPSDelayHaltCPU );
	} else {
		halt_all_cpus(!(options & HOST_REBOOT_HALT));
	}

	return KERN_SUCCESS;
}

/*
 *	processor_assign:
 *
 *	Every argument is ignored; the routine unconditionally returns
 *	KERN_FAILURE.
 */
kern_return_t
processor_assign(
	__unused processor_t            processor,
	__unused processor_set_t        new_pset,
	__unused boolean_t              wait)
{
	return KERN_FAILURE;
}

/*
 *	processor_doshutdown:
 *
 *	Take `processor` offline.
 *
 *	The caller must hold cluster_powerdown_lock and processor_updown_lock
 *	(both are asserted).  Panics if the processor was never booted.
 *
 *	When is_final_system_sleep is true, the asserts require `processor`
 *	to be the current processor, the master processor, and the only
 *	available processor.
 */
void
processor_doshutdown(
	processor_t     processor,
	bool            is_final_system_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	lck_mtx_assert(&processor_updown_lock, LCK_MTX_ASSERT_OWNED);

	if (!processor->processor_booted) {
		panic("processor %d not booted", processor->cpu_id);
	}

	if (is_final_system_sleep) {
		assert(processor == current_processor());
		assert(processor == master_processor);
		assert(processor_avail_count == 1);
	}

	processor_set_t pset = processor->processor_set;

	/* Paired with ml_cpu_end_state_transition() at the end of this routine. */
	ml_cpu_begin_state_transition(processor->cpu_id);

	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);

#if HIBERNATION
	if
(is_final_system_sleep) {
		/*
		 * Ensure the page queues are in a state where the hibernation
		 * code can manipulate them without requiring other threads
		 * to be scheduled.
		 *
		 * This operation can block,
		 * and unlock must be done from the same thread.
		 */
		assert(processor_avail_count < 2);
		hibernate_vm_lock();
	}
#endif

	/*
	 * Flag the processor as shutting down.  The flag and the offline
	 * state are updated at splsched with both
	 * sched_available_cores_lock and the pset lock held.
	 */
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	assert(processor->state != PROCESSOR_START);
	assert(processor->state != PROCESSOR_PENDING_OFFLINE);
	assert(processor->state != PROCESSOR_OFF_LINE);

	assert(!processor->processor_inshutdown);
	processor->processor_inshutdown = true;

	assert(processor->processor_offline_state == PROCESSOR_OFFLINE_RUNNING);
	processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_BEGIN_SHUTDOWN);

	if (!is_final_system_sleep) {
		sched_assert_not_last_online_cpu(processor->cpu_id);
	}

	pset_unlock(pset);
	simple_unlock(&sched_available_cores_lock);

	if (is_final_system_sleep) {
		assert(processor == current_processor());

#if HIBERNATION
		/*
		 * After this point, the system is now
		 * committed to hibernation and must
		 * not run any other thread that could take this lock.
		 */
		hibernate_vm_unlock();
#endif
	} else {
		/*
		 * Get onto the processor to shut down.
		 * The scheduler picks this thread naturally according to its
		 * priority.
		 * The processor can run any other thread if this one blocks.
		 * So, don't block.
		 */
		processor_t prev = thread_bind(processor);
		thread_block(THREAD_CONTINUE_NULL);

		/* interrupts still disabled */
		assert(ml_get_interrupts_enabled() == FALSE);

		assert(processor == current_processor());
		assert(processor->processor_inshutdown);

		/* Restore the binding that was in effect before thread_bind(processor). */
		thread_bind(prev);
		/* interrupts still disabled */
	}

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
*/
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	/* processor_offline() converts this parameter back to a bool. */
	shutdown_thread->parameter = (void*)is_final_system_sleep;

	thread_run(current_thread(), THREAD_CONTINUE_NULL, NULL, shutdown_thread);

	/*
	 * After this point, we are in regular scheduled context on a remaining
	 * available CPU. Interrupts are still disabled.
	 */

	if (is_final_system_sleep) {
		/*
		 * We are coming out of system sleep here, so there won't be a
		 * corresponding processor_startup for this processor, so we
		 * need to put it back in the correct running state.
		 *
		 * There's nowhere to execute a call to CPU_EXITED during system
		 * sleep for the boot processor, and it's already been CPU_BOOTED
		 * by this point anyways, so skip the call.
		 */
		assert(current_processor() == master_processor);
		assert(processor->state == PROCESSOR_RUNNING);
		assert(processor->processor_inshutdown);
		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_WAITED);
		processor->processor_inshutdown = false;
		processor_update_offline_state(processor, PROCESSOR_OFFLINE_RUNNING);

		splx(s);
	} else {
		splx(s);

		cpu_exit_wait(processor->cpu_id);

		/*
		 * Move the processor from PENDING_OFFLINE to OFF_LINE, again at
		 * splsched with sched_available_cores_lock and the pset lock held.
		 */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		pset_lock(pset);
		assert(processor->processor_inshutdown);
		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_PENDING_OFFLINE);
		assert(processor->state == PROCESSOR_PENDING_OFFLINE);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_CPU_OFFLINE);
		pset_unlock(pset);
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
		ml_cpu_power_disable(processor->cpu_id);

		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_CPU_OFFLINE);
		processor_update_offline_state(processor, PROCESSOR_OFFLINE_FULLY_OFFLINE);
	}

	/* Paired with ml_cpu_begin_state_transition() earlier in this routine. */
	ml_cpu_end_state_transition(processor->cpu_id);
}

/*
 * Called in the context of the idle thread to shut down the processor
*
 * A shut-down processor looks like it's 'running' the idle thread parked
 * in this routine, but it's actually been powered off and has no hardware state.
 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	/* processor_doshutdown() stored is_final_system_sleep in the thread parameter. */
	bool is_final_system_sleep = (bool) parameter;
	processor_t processor = current_processor();
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_online == true);
	assert(processor->running_timers_active == false);

	if (is_final_system_sleep) {
		assert(processor == current_processor());
		assert(processor == master_processor);
		assert(processor_avail_count == 1);
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_START, processor->cpu_id);

	/*
	 * Sample the flag once so the enable_preemption() after the processor
	 * returns is taken only if preemption was disabled here.
	 */
	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

#if CONFIG_DTRACE
	/* Invoke the DTrace CPU state hook, when one is installed, with FALSE. */
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);

	/* Drain pending IPIs for the last time here. */
	ml_cpu_down();

	sched_mark_processor_offline(processor, is_final_system_sleep);

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/*
	 * The processor is back. sched_mark_processor_online and
	 * friends have already run via processor_up.
*/

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());
	assert(processor->processor_online == true);

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by secondary_cpu_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	processor_cpu_reinit(machine_param, true, is_final_system_sleep);

	/* Balances the disable_preemption_without_measurements() taken before shutdown. */
	if (enforce_quiesce_safety) {
		enable_preemption();
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_END, processor->cpu_id);

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}

/*
 * Complete the shutdown and place the processor offline.
 *
 * Called at splsched in the shutdown context
 * (i.e. on the idle thread, on the interrupt stack)
 *
 * The onlining half of this is done in load_context().
 *
 * Declared __dead2 in the forward declaration: if cpu_sleep() ever
 * returns, the routine panics.
 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	/* Take a recount snapshot and hand it to the processor's idle accounting. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	smr_cpu_leave(processor, processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}

/*
 * Called on the idle thread with interrupts disabled to initialize a
 * secondary processor on boot or to reinitialize any processor on resume
 * from processor offline.
 *
 * machine_param is handed straight to machine_cpu_reinit().
 * wait_for_cpu_signal is only consulted in the arm64 section below.
 * is_final_system_sleep is only consulted by an assert.
 */
void
processor_cpu_reinit(void* machine_param,
    __unused bool wait_for_cpu_signal,
    __assert_only bool is_final_system_sleep)
{
	/* Re-initialize the processor */
	machine_cpu_reinit(machine_param);

#if defined(__arm64__)
	/*
	 * See the comments for wait_while_mp_kdp_trap in processor_up().
*
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI.  This is triggered by machine_cpu_reinit(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts.  So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	assert_ml_cpu_signal_is_enabled(false);

	/* Open a window with interrupts enabled; they are disabled again below. */
	ml_set_interrupts_enabled(TRUE);

	if (wait_for_cpu_signal) {
		ml_wait_for_cpu_signal_to_enable();
	}

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point,
	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * or we successfully received a SIGPdebug signal which will cause us to
	 * break out of the spin on mp_kdp_trap and instead
	 * spin next time interrupts are enabled in idle_thread().
	 */
	if (wait_for_cpu_signal) {
		assert_ml_cpu_signal_is_enabled(true);
	}

	/*
	 * Now that we know SIGPdisabled is cleared, we can publish that
	 * this CPU has fully come out of offline state.
	 *
	 * Without wait_for_cpu_signal, we'll publish this earlier than
	 * cpu_signal is actually ready, but as long as it's ready by next S2R,
	 * it will be good enough.
	 */
	ml_cpu_up();
#endif

	/*
	 * Interrupts must be disabled while processor_start_state_lock is
	 * held to prevent a deadlock with CPU startup of other CPUs that
	 * may be proceeding in parallel to this CPU's reinitialization.
*/ spl_t s = splsched(); processor_t processor = current_processor(); simple_lock(&processor_start_state_lock, LCK_GRP_NULL); assert(processor->processor_instartup == true || is_final_system_sleep); processor->processor_instartup = false; simple_unlock(&processor_start_state_lock); splx(s); thread_wakeup((event_t)&processor->processor_instartup); } kern_return_t host_get_boot_info( host_priv_t host_priv, kernel_boot_info_t boot_info) { const char *src = ""; if (host_priv == HOST_PRIV_NULL) { return KERN_INVALID_HOST; } /* * Copy first operator string terminated by '\0' followed by * standardized strings generated from boot string. */ src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX); if (src != boot_info) { (void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX); } return KERN_SUCCESS; } // These are configured through sysctls. #if DEVELOPMENT || DEBUG uint32_t phy_read_panic = 1; uint32_t phy_write_panic = 1; uint64_t simulate_stretched_io = 0; #else uint32_t phy_read_panic = 0; uint32_t phy_write_panic = 0; #endif #if ML_IO_TIMEOUTS_ENABLED mmio_track_t PERCPU_DATA(mmio_tracker); #endif #if !defined(__x86_64__) #if DEVELOPMENT || DEBUG static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC; static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC; #else static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0; #endif // The MACHINE_TIMEOUT facility only exists on ARM. 
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL); MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL); MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL); MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL); #if SCHED_HYGIENE_DEBUG /* * Note: The interrupt-masked timeout goes through two initializations - one * early in boot and one later. Thus this function is also called twice and * can't be marked '__startup_func'. */ static void ml_io_init_timeouts(void) { /* * The timeouts may be completely disabled via an override. */ if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) { os_atomic_store(&report_phy_write_delay_to, 0, relaxed); os_atomic_store(&report_phy_read_delay_to, 0, relaxed); return; } /* * There may be no interrupt masked timeout set. */ const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed); if (interrupt_masked_timeout == 0) { return; } /* * Inherit from the interrupt masked timeout if smaller and the timeout * hasn't been explicitly set via boot-arg. */ uint64_t arg = 0; if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) { uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed); report_phy_read_delay = report_phy_read_delay == 0 ? interrupt_masked_to : MIN(report_phy_read_delay, interrupt_masked_to); os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed); } if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) { uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed); report_phy_write_delay = report_phy_write_delay == 0 ? 
interrupt_masked_to :
		    MIN(report_phy_write_delay, interrupt_masked_to);
		os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
	}
}

/*
 * It's important that this happens after machine timeouts have initialized so
 * the correct timeouts can be inherited.
 */
STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
#endif /* SCHED_HYGIENE_DEBUG */

extern pmap_paddr_t kvtophys(vm_offset_t va);
#endif /* !defined(__x86_64__) */

#if ML_IO_TIMEOUTS_ENABLED

/* Spin lock taken around every access to the two override trees below. */
static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);

/*
 * One registered timeout override: the address range
 * [ioaddr_base, ioaddr_base + size) and the timeouts that apply to it.
 */
struct io_timeout_override_entry {
	RB_ENTRY(io_timeout_override_entry) tree;       /* red-black tree linkage */

	uintptr_t ioaddr_base;                          /* first address of the range */
	unsigned int size;                              /* range length; never 0 */
	uint32_t read_timeout;                          /* absolute-time units */
	uint32_t write_timeout;                         /* absolute-time units */
};

/*
 * Tree ordering function: entries are ordered by base address only.
 * Returns -1, 0 or 1.
 */
static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
{
	if (a->ioaddr_base < b->ioaddr_base) {
		return -1;
	} else if (a->ioaddr_base > b->ioaddr_base) {
		return 1;
	} else {
		return 0;
	}
}

/* Separate trees for overrides keyed by physical and by virtual address. */
static RB_HEAD(io_timeout_override, io_timeout_override_entry)
io_timeout_override_root_pa, io_timeout_override_root_va;

RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);

/*
 * Register a timeout override for [ioaddr_base, ioaddr_base + size) in the
 * tree `root`.
 *
 * The timeouts are given in microseconds and stored in absolute-time units.
 * Returns KERN_INVALID_ARGUMENT when size is 0, when the range wraps the
 * address space, when a converted timeout does not fit in 32 bits, or when
 * the range collides with an existing entry; KERN_SUCCESS otherwise.
 *
 * Must be called with preemption enabled (asserted).
 */
static int
io_increase_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base,
    unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Reject ranges whose last byte lies beyond the end of the address space. */
	uintptr_t ioaddr_end;
	if (os_add_overflow(ioaddr_base, size - 1, &ioaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us,
&write_timeout_abs); if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) { return KERN_INVALID_ARGUMENT; } struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL); node->ioaddr_base = ioaddr_base; node->size = size; node->read_timeout = (uint32_t)read_timeout_abs; node->write_timeout = (uint32_t)write_timeout_abs; /* * Interrupt handlers are allowed to call ml_io_{read,write}*, so * interrupts must be disabled any time io_timeout_override_lock is * held. Otherwise the CPU could take an interrupt while holding the * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock * trying to acquire the lock again. */ boolean_t istate = ml_set_interrupts_enabled(FALSE); lck_spin_lock(&io_timeout_override_lock); if (RB_INSERT(io_timeout_override, root, node)) { ret = KERN_INVALID_ARGUMENT; goto out; } /* Check that this didn't create any new overlaps */ struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, root, node); if (prev && (prev->ioaddr_base + prev->size) > node->ioaddr_base) { RB_REMOVE(io_timeout_override, root, node); ret = KERN_INVALID_ARGUMENT; goto out; } struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, root, node); if (next && (node->ioaddr_base + node->size) > next->ioaddr_base) { RB_REMOVE(io_timeout_override, root, node); ret = KERN_INVALID_ARGUMENT; goto out; } out: lck_spin_unlock(&io_timeout_override_lock); ml_set_interrupts_enabled(istate); if (ret != KERN_SUCCESS) { kfree_type(struct io_timeout_override_entry, node); } return ret; } static int io_reset_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base, unsigned int size) { assert(preemption_enabled()); struct io_timeout_override_entry key = { .ioaddr_base = ioaddr_base }; boolean_t istate = ml_set_interrupts_enabled(FALSE); lck_spin_lock(&io_timeout_override_lock); struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, root, &key); if (node) 
{ if (node->size == size) { RB_REMOVE(io_timeout_override, root, node); } else { node = NULL; } } lck_spin_unlock(&io_timeout_override_lock); ml_set_interrupts_enabled(istate); if (!node) { return KERN_NOT_FOUND; } kfree_type(struct io_timeout_override_entry, node); return KERN_SUCCESS; } static bool io_override_timeout(struct io_timeout_override *root, uintptr_t addr, uint64_t *read_timeout, uint64_t *write_timeout) { assert(!ml_get_interrupts_enabled()); assert3p(read_timeout, !=, NULL); assert3p(write_timeout, !=, NULL); struct io_timeout_override_entry *node = RB_ROOT(root); lck_spin_lock(&io_timeout_override_lock); /* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */ while (node) { if (node->ioaddr_base <= addr && addr < node->ioaddr_base + node->size) { *read_timeout = node->read_timeout; *write_timeout = node->write_timeout; lck_spin_unlock(&io_timeout_override_lock); return true; } else if (addr < node->ioaddr_base) { node = RB_LEFT(node, tree); } else { node = RB_RIGHT(node, tree); } } lck_spin_unlock(&io_timeout_override_lock); return false; } static bool io_override_timeout_ss(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout) { #if defined(__arm64__) /* * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a * timeout greater than two PCIe completion timeouts (90ms) as they can * stack. */ #define STRONG_SYNC_TIMEOUT 2160000 /* 90ms */ pmap_io_range_t *range = pmap_find_io_attr(paddr); if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) { *read_timeout = STRONG_SYNC_TIMEOUT; *write_timeout = STRONG_SYNC_TIMEOUT; return true; } #else (void)paddr; (void)read_timeout; (void)write_timeout; #endif /* __arm64__ */ return false; } /* * Return timeout override values for the read/write timeout for a given * address. * A virtual address (vaddr), physical address (paddr) or both may be passed. 
* Up to three separate timeout overrides can be found
 *  - A virtual address override
 *  - A physical address override
 *  - A strong sync override
 *  The largest override found is returned.
 */
void
override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	/* Each candidate stays 0 unless the matching lookup finds an override. */
	uint64_t rt_va = 0, wt_va = 0, rt_pa = 0, wt_pa = 0, rt_ss = 0, wt_ss = 0;

	if (vaddr != 0) {
		/* Override from virtual address. */
		io_override_timeout(&io_timeout_override_root_va, vaddr, &rt_va, &wt_va);
	}

	if (paddr != 0) {
		/* Override from physical address. */
		io_override_timeout(&io_timeout_override_root_pa, paddr, &rt_pa, &wt_pa);

		/* Override from strong sync range. */
		io_override_timeout_ss(paddr, &rt_ss, &wt_ss);
	}

	/* Either output pointer may be NULL when the caller does not need it. */
	if (read_timeout != NULL) {
		*read_timeout = MAX(MAX(rt_va, rt_pa), rt_ss);
	}

	if (write_timeout != NULL) {
		*write_timeout = MAX(MAX(wt_va, wt_pa), wt_ss);
	}
}

#endif /* ML_IO_TIMEOUTS_ENABLED */

/*
 * Register a timeout override for a virtual address range of at most
 * 4096 bytes; larger sizes return KERN_INVALID_ARGUMENT.  Without
 * ML_IO_TIMEOUTS_ENABLED this does nothing and returns KERN_SUCCESS.
 */
int
ml_io_increase_timeouts(uintptr_t ioaddr_base, unsigned int size,
    uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	const size_t MAX_SIZE = 4096;

	if (size > MAX_SIZE) {
		return KERN_INVALID_ARGUMENT;
	}

	return io_increase_timeouts(&io_timeout_override_root_va, ioaddr_base,
	           size, read_timeout_us, write_timeout_us);
#else
	#pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif /* ML_IO_TIMEOUTS_ENABLED */
}

/*
 * Register a timeout override for a physical address range.  Unlike the
 * virtual-address variant, no upper bound is placed on size here.  Without
 * ML_IO_TIMEOUTS_ENABLED this does nothing and returns KERN_SUCCESS.
 */
int
ml_io_increase_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size,
    uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	return io_increase_timeouts(&io_timeout_override_root_pa, ioaddr_base,
	           size, read_timeout_us, write_timeout_us);
#else
	#pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif /* ML_IO_TIMEOUTS_ENABLED */
}

/*
 * Remove a virtual-address timeout override; base and size must match the
 * registered entry.  Without ML_IO_TIMEOUTS_ENABLED this does nothing and
 * returns KERN_SUCCESS.
 */
int
ml_io_reset_timeouts(uintptr_t ioaddr_base, unsigned int size)
{
#if ML_IO_TIMEOUTS_ENABLED
	return io_reset_timeouts(&io_timeout_override_root_va, ioaddr_base, size);
#else
	#pragma unused(ioaddr_base, size)
	return KERN_SUCCESS;
#endif /* ML_IO_TIMEOUTS_ENABLED */
}

/*
 * Remove a physical-address timeout override; base and size must match the
 * registered entry.  Without ML_IO_TIMEOUTS_ENABLED this does nothing and
 * returns KERN_SUCCESS.
 */
int
ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size)
{
#if ML_IO_TIMEOUTS_ENABLED
	return io_reset_timeouts(&io_timeout_override_root_pa, ioaddr_base, size);
#else
	#pragma unused(ioaddr_base, size)
	return KERN_SUCCESS;
#endif /* ML_IO_TIMEOUTS_ENABLED */
}


#if ML_IO_TIMEOUTS_ENABLED
/*
 * Walk every CPU's mmio tracker and report whether any tracked access has a
 * timeout override that has not yet expired at time `mt`, i.e. whether
 * mt < mmio_start_mt + MAX(read override, write override).
 *
 * Interrupts are disabled for the duration of the walk and restored to
 * their previous state before returning.
 */
boolean_t
ml_io_check_for_mmio_overrides(__unused uint64_t mt)
{
#if __arm64__
	/* Issue a barrier before accessing the remote mmio trackers */
	__builtin_arm_dmb(DMB_ISH);
#endif
	boolean_t istate = ml_set_interrupts_enabled_with_debug(false, false);
	percpu_foreach(mmiot, mmio_tracker) {
		uint64_t read_timeout;
		uint64_t write_timeout;

		override_io_timeouts(mmiot->mmio_vaddr, mmiot->mmio_paddr, &read_timeout, &write_timeout);

		if (read_timeout > 0 || write_timeout > 0) {
			if (mt < (mmiot->mmio_start_mt + MAX(read_timeout, write_timeout))) {
				ml_set_interrupts_enabled_with_debug(istate, false);
				return true;
			}
		}
	}
	ml_set_interrupts_enabled_with_debug(istate, false);
	return false;
}
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
/* Non-zero enables the vaddr == 1 / vaddr == 2 test shortcuts in ml_io_read(). */
static int ml_io_read_test_mode;
#endif

/*
 * Read `size` bytes (1, 2, 4 or 8) from the I/O address vaddr with a single
 * volatile access and return the value zero-extended.  Any other size
 * panics.
 *
 * When ML_IO_TIMEOUTS_ENABLED is defined and a report timeout is configured,
 * the access is timed with interrupts disabled and may be traced, reported
 * or turned into a panic if it takes too long.
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#if DEVELOPMENT || DEBUG
	/* For testing */
	extern void IODelay(int);
	if (__improbable(ml_io_read_test_mode)) {
		if (vaddr == 1) {
			IODelay(100);
			return 0;
		} else if (vaddr == 2) {
			return 0;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* 0 means "not translated yet"; kvtophys() fills it in on demand below. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	kern_timeout_t timeout;
	boolean_t istate, use_timeout = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay =
os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	/* Only time the access when a report timeout is configured. */
	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);

		kern_timeout_start(&timeout, TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC);
		use_timeout = true;

		if (paddr == 0) {
			paddr = kvtophys(vaddr);
		}
		/* Record the in-flight access in this CPU's mmio tracker. */
		mmio_track_t *mmiot = PERCPU_GET(mmio_tracker);
		mmiot->mmio_start_mt = kern_timeout_start_time(&timeout);
		mmiot->mmio_paddr = paddr;
		mmiot->mmio_vaddr = vaddr;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(use_timeout && simulate_stretched_io)) {
		kern_timeout_stretch(&timeout, simulate_stretched_io);
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* The fences around the access are skipped when KF_IO_TIMEOUT_OVRD is set. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The access itself: exactly one volatile load of the requested width. */
	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(use_timeout == TRUE)) {
		kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE);
		uint64_t duration = kern_timeout_gross_duration(&timeout);

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure.
*/
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result,
			    kern_timeout_start_time(&timeout), duration);
		}

		if (__improbable(duration > report_read_delay)) {
			DTRACE_PHYSLAT5(physioread, uint64_t, duration,
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			/* A registered override replaces the default report threshold. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. If we were called in an
				 * interrupt handler context, that can lead to a timeout
				 * panic, so we need to abandon the measurement.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		/* Re-test against the (possibly overridden) threshold. */
		if (__improbable(duration > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
				char str[128];
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				snprintf(str, sizeof(str),
				    "Read from IO vaddr 0x%lx paddr 0x%lx (result: 0x%llx) timed out:",
				    vaddr, paddr, result);
				kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str,
				    report_read_delay);
			}
		}

		if (__improbable(trace_phy_read_delay > 0 && duration > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    duration, VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		/* Restore the interrupt state saved when the measurement started. */
		(void)ml_set_interrupts_enabled_with_debug(istate, false);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}

/* 8-bit read through ml_io_read(). */
unsigned int
ml_io_read8(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 1);
}

/* 16-bit read through ml_io_read(). */
unsigned int
ml_io_read16(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 2);
}

/* 32-bit read through ml_io_read(). */
unsigned int
ml_io_read32(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 4);
}

/* 64-bit read through ml_io_read(). */
unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	return ml_io_read(vaddr, 8);
}

/*
 * Read `sz` bytes from vaddr through ml_io_read(); logical_cpu is ignored
 * by this implementation.
 */
uint64_t
ml_io_read_cpu_reg(uintptr_t vaddr, int sz, __unused int logical_cpu)
{
	uint64_t val;

	val = ml_io_read(vaddr, sz);

	return val;
}

/* ml_io_write* */

void
ml_io_write(uintptr_t vaddr,
uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* 0 means "not translated yet"; kvtophys() fills it in on demand below. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	kern_timeout_t timeout;
	boolean_t istate, use_timeout = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* __x86_64__ */

	/* Only time the access when a report timeout is configured. */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);

		kern_timeout_start(&timeout, TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC);
		use_timeout = TRUE;

		if (paddr == 0) {
			paddr = kvtophys(vaddr);
		}
		/* Record the in-flight access in this CPU's mmio tracker. */
		mmio_track_t *mmiot = PERCPU_GET(mmio_tracker);
		mmiot->mmio_start_mt = kern_timeout_start_time(&timeout);
		mmiot->mmio_paddr = paddr;
		mmiot->mmio_vaddr = vaddr;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(use_timeout && simulate_stretched_io)) {
		kern_timeout_stretch(&timeout, simulate_stretched_io);
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* The fences around the access are skipped when KF_IO_TIMEOUT_OVRD is set. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The access itself: exactly one volatile store of the requested width. */
	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(use_timeout == TRUE)) {
		kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE);
		uint64_t duration = kern_timeout_gross_duration(&timeout);

		/* Prevent the processor from calling iotrace during
its * initialization procedure. */ if (current_processor()->state == PROCESSOR_RUNNING) { iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, kern_timeout_start_time(&timeout), duration); } if (__improbable(duration > report_write_delay)) { DTRACE_PHYSLAT5(physiowrite, uint64_t, duration, uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val); uint64_t override = 0; override_io_timeouts(vaddr, paddr, NULL, &override); if (override != 0) { #if SCHED_HYGIENE_DEBUG /* * The IO timeout was overridden. If we were called in an * interrupt handler context, that can lead to a timeout * panic, so we need to abandon the measurement. */ if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) { ml_irq_debug_abandon(); } #endif report_write_delay = override; } } if (__improbable(duration > report_write_delay)) { if (phy_write_panic && (machine_timeout_suspended() == FALSE)) { char str[128]; #if defined(__x86_64__) panic_notify(); #endif /* defined(__x86_64__) */ snprintf(str, sizeof(str), "Write to IO vaddr 0x%lx paddr 0x%lx (value: 0x%llx) timed out:", vaddr, paddr, val); kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str, report_write_delay); } } if (__improbable(trace_phy_write_delay > 0 && duration > trace_phy_write_delay)) { KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE), duration, VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val); } (void)ml_set_interrupts_enabled_with_debug(istate, false); } #endif /* ML_IO_TIMEOUTS_ENABLED */ } void ml_io_write8(uintptr_t vaddr, uint8_t val) { ml_io_write(vaddr, val, 1); } void ml_io_write16(uintptr_t vaddr, uint16_t val) { ml_io_write(vaddr, val, 2); } void ml_io_write32(uintptr_t vaddr, uint32_t val) { ml_io_write(vaddr, val, 4); } void ml_io_write64(uintptr_t vaddr, uint64_t val) { ml_io_write(vaddr, val, 8); } struct cpu_callback_chain_elem { cpu_callback_t fn; void *param; struct cpu_callback_chain_elem *next; }; static struct cpu_callback_chain_elem *cpu_callback_chain; static 
LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain"); static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp); struct cpu_event_log_entry { uint64_t abstime; enum cpu_event event; unsigned int cpu_or_cluster; }; #if DEVELOPMENT || DEBUG #define CPU_EVENT_RING_SIZE 128 static struct cpu_event_log_entry cpu_event_ring[CPU_EVENT_RING_SIZE]; static _Atomic int cpu_event_widx; static _Atomic uint64_t cpd_cycles; void cpu_event_debug_log(enum cpu_event event, unsigned int cpu_or_cluster) { int oldidx, newidx; os_atomic_rmw_loop(&cpu_event_widx, oldidx, newidx, relaxed, { newidx = (oldidx + 1) % CPU_EVENT_RING_SIZE; }); cpu_event_ring[newidx].abstime = ml_get_timebase(); cpu_event_ring[newidx].event = event; cpu_event_ring[newidx].cpu_or_cluster = cpu_or_cluster; if (event == CLUSTER_EXIT_REQUESTED) { os_atomic_inc(&cpd_cycles, relaxed); } } static const char * cpu_event_log_string(enum cpu_event e) { const char *event_strings[] = { "CPU_BOOT_REQUESTED", "CPU_BOOTED", "CPU_ACTIVE", "CLUSTER_ACTIVE", "CPU_EXIT_REQUESTED", "CPU_DOWN", "CLUSTER_EXIT_REQUESTED", "CPU_EXITED", "PLATFORM_QUIESCE", "PLATFORM_ACTIVE", "PLATFORM_HALT_RESTART", "PLATFORM_PANIC", "PLATFORM_PANIC_SYNC", "PLATFORM_PRE_SLEEP", "PLATFORM_POST_RESUME", }; assert((unsigned)e < sizeof(event_strings) / sizeof(event_strings[0])); return event_strings[e]; } void dump_cpu_event_log(int (*printf_func)(const char * fmt, ...)) { printf_func("CPU event history @ %016llx: (CPD cycles: %lld)\n", ml_get_timebase(), os_atomic_load(&cpd_cycles, relaxed)); int idx = os_atomic_load(&cpu_event_widx, relaxed); for (int c = 0; c < CPU_EVENT_RING_SIZE; c++) { idx = (idx + 1) % CPU_EVENT_RING_SIZE; struct cpu_event_log_entry *e = &cpu_event_ring[idx]; if (e->abstime != 0) { printf_func(" %016llx: %s %d\n", e->abstime, cpu_event_log_string(e->event), e->cpu_or_cluster); } } } #else /* DEVELOPMENT || DEBUG */ void cpu_event_debug_log(__unused enum cpu_event event, __unused unsigned int 
cpu_or_cluster) { /* no logging on production builds */ } void dump_cpu_event_log(__unused int (*printf_func)(const char * fmt, ...)) { } #endif /* DEVELOPMENT || DEBUG */ void cpu_event_register_callback(cpu_callback_t fn, void *param) { struct cpu_callback_chain_elem *new_elem; new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem); if (!new_elem) { panic("can't allocate cpu_callback_chain_elem"); } lck_spin_lock(&cpu_callback_chain_lock); new_elem->next = cpu_callback_chain; new_elem->fn = fn; new_elem->param = param; os_atomic_store(&cpu_callback_chain, new_elem, release); lck_spin_unlock(&cpu_callback_chain_lock); } __attribute__((noreturn)) void cpu_event_unregister_callback(__unused cpu_callback_t fn) { panic("Unfortunately, cpu_event_unregister_callback is unimplemented."); } void ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster) { struct cpu_callback_chain_elem *cursor; cpu_event_debug_log(event, cpu_or_cluster); cursor = os_atomic_load(&cpu_callback_chain, dependency); for (; cursor != NULL; cursor = cursor->next) { cursor->fn(cursor->param, event, cpu_or_cluster); } } // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro // definition) void machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix, bool always_enabled) { if (!always_enabled && (wdt == -1 || (spec->skip_predicate != NULL && spec->skip_predicate(spec)))) { // This timeout should be disabled. 
os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed); return; } assert(suffix != NULL); assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN); size_t const suffix_len = strlen(suffix); size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1; char dt_name[dt_name_size]; strlcpy(dt_name, spec->name, dt_name_size); strlcat(dt_name, suffix, dt_name_size); size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1; char scale_name[scale_name_size]; strlcpy(scale_name, spec->name, scale_name_size); strlcat(scale_name, suffix, scale_name_size); strlcat(scale_name, "-scale", scale_name_size); size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1; char boot_arg_name[boot_arg_name_size]; strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size); strlcat(boot_arg_name, spec->name, boot_arg_name_size); strlcat(boot_arg_name, suffix, boot_arg_name_size); size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1; char boot_arg_scale_name[boot_arg_scale_name_size]; strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size); strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size); strlcat(boot_arg_scale_name, suffix, boot_arg_name_size); strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size); /* * Determine base value from DT and boot-args. 
*/ DTEntry base, chosen; if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) { base = NULL; } if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) { chosen = NULL; } uint64_t timeout = spec->default_value; bool found = false; uint64_t const *data = NULL; unsigned int data_size = sizeof(*data); /* First look in /machine-timeouts/ */ if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) { if (data_size != sizeof(*data)) { panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name); } timeout = *data; found = true; } /* A value in /chosen/machine-timeouts/ overrides */ if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) { if (data_size != sizeof(*data)) { panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name); } timeout = *data; found = true; } /* A boot-arg ml-timeout- overrides */ uint64_t boot_arg = 0; if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) { timeout = boot_arg; found = true; } /* * Determine scale value from DT and boot-args. */ uint64_t scale = 1; uint32_t const *scale_data; unsigned int scale_size = sizeof(scale_data); /* If there is a scale factor /machine-timeouts/-scale, apply it. */ if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) { if (scale_size != sizeof(*scale_data)) { panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name); } scale = *scale_data; } /* If there is a scale factor /chosen/machine-timeouts/-scale, use that. 
*/ if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) { if (scale_size != sizeof(*scale_data)) { panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__, scale_size, dt_name); } scale = *scale_data; } /* Finally, a boot-arg ml-timeout--scale takes precedence. */ if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) { scale = boot_arg; } static bool global_scale_set; static uint64_t global_scale; if (!global_scale_set) { /* Apply /machine-timeouts/global-scale if present */ if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) { if (scale_size != sizeof(*scale_data)) { panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__, scale_size); } global_scale = *scale_data; global_scale_set = true; } /* Use /chosen/machine-timeouts/global-scale if present */ if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) { if (scale_size != sizeof(*scale_data)) { panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__, scale_size); } global_scale = *scale_data; global_scale_set = true; } /* Finally, the boot-arg ml-timeout-global-scale takes precedence. */ if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) { global_scale = boot_arg; global_scale_set = true; } } if (global_scale_set) { scale *= global_scale; } /* Compute the final timeout, and done. */ if (found && timeout > 0) { /* Only apply inherent unit scale if the value came in * externally. */ if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) { uint64_t nanoseconds = timeout / 1000; nanoseconds_to_absolutetime(nanoseconds, &timeout); } else { timeout /= spec->unit_scale; } if (timeout == 0) { /* Ensure unit scaling did not disable the timeout. 
*/ timeout = 1; } } if (os_mul_overflow(timeout, scale, &timeout)) { timeout = UINT64_MAX; // clamp } os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed); } void machine_timeout_init(const struct machine_timeout_spec *spec) { machine_timeout_init_with_suffix(spec, "", false); } void machine_timeout_init_always_enabled(const struct machine_timeout_spec *spec) { machine_timeout_init_with_suffix(spec, "", true); } #if DEVELOPMENT || DEBUG /* * Late timeout (re-)initialization, at the end of bsd_init() */ void machine_timeout_bsd_init(void) { char const * const __unused mt_suffix = "-b"; #if SCHED_HYGIENE_DEBUG machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix, false); machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix, false); /* * The io timeouts can inherit from interrupt_masked_timeout. * Re-initialize, as interrupt_masked_timeout may have changed. */ ml_io_init_timeouts(); extern void preemption_disable_reset_max_durations(void); /* * Reset the preemption disable stats, so that they are not * polluted by long early boot code. 
*/ preemption_disable_reset_max_durations(); #endif /* SCHED_HYGIENE_DEBUG */ } #endif /* DEVELOPMENT || DEBUG */ #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST #include extern kern_return_t ml_io_timeout_test(void); static inline void ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout) { *read_timeout = 0; *write_timeout = 0; vm_offset_t paddr = kvtophys(vaddr); boolean_t istate = ml_set_interrupts_enabled(FALSE); override_io_timeouts(vaddr, paddr, read_timeout, write_timeout); ml_set_interrupts_enabled(istate); } static inline void ml_io_timeout_test_get_timeouts_phys(vm_offset_t paddr, uint64_t *read_timeout, uint64_t *write_timeout) { *read_timeout = 0; *write_timeout = 0; boolean_t istate = ml_set_interrupts_enabled(FALSE); override_io_timeouts(0, paddr, read_timeout, write_timeout); ml_set_interrupts_enabled(istate); } kern_return_t ml_io_timeout_test(void) { const size_t SIZE = 16; /* * Page align the base address to ensure that the regions are physically * contiguous. 
*/ const uintptr_t iovaddr_base1 = (uintptr_t)kernel_pmap & ~PAGE_MASK; const uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE; const uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2; const uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2; const vm_offset_t iopaddr_base1 = kvtophys(iovaddr_base1); const vm_offset_t iopaddr_base2 = kvtophys(iovaddr_base2); const vm_offset_t paddr1 = iopaddr_base1 + SIZE / 2; const vm_offset_t paddr2 = iopaddr_base2 + SIZE / 2; const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001; const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003; uint64_t read_timeout1_abs, write_timeout1_abs; uint64_t read_timeout2_abs, write_timeout2_abs; nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs); nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs); nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs); nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs); int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region"); err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes"); err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region"); err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed"); err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed"); err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); 
T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice"); err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions"); uint64_t read_timeout, write_timeout; ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region"); ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region"); ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override"); T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override"); err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1); T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion"); err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE); T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region"); err = ml_io_reset_timeouts(iovaddr_base1, SIZE); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed"); ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region"); T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region"); err = ml_io_reset_timeouts(iovaddr_base1, SIZE); T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice"); err = ml_io_reset_timeouts(iovaddr_base2, SIZE); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed"); err = ml_io_increase_timeouts_phys(iopaddr_base1, 
SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first PA region should succeed"); err = ml_io_increase_timeouts_phys(iopaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second PA region should succeed"); ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region"); ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region"); ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region"); ml_io_timeout_test_get_timeouts_phys(paddr2, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first physical region"); T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first physical region"); err = ml_io_reset_timeouts_phys(iopaddr_base1, SIZE); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first PA region should succeed"); err = ml_io_reset_timeouts_phys(iopaddr_base2, SIZE); T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second PA region should succeed"); ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout); T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region"); T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region"); return KERN_SUCCESS; } #endif /* CONFIG_XNUPOST */ #if DEVELOPMENT || DEBUG static int ml_io_read_cpu_reg_test(__unused int64_t in, int64_t *out) { 
printf("Testing ml_io_read_cpu_reg()...\n"); ml_io_read_test_mode = 1; boolean_t istate = ml_set_interrupts_enabled_with_debug(false, false); (void) ml_io_read_cpu_reg((uintptr_t)1, 8, 1); (void) ml_io_read_cpu_reg((uintptr_t)2, 8, 1); ml_set_interrupts_enabled_with_debug(istate, false); (void) ml_io_read_cpu_reg((uintptr_t)1, 8, 1); (void) ml_io_read_cpu_reg((uintptr_t)2, 8, 1); ml_io_read_test_mode = 0; *out = 0; return 0; } SYSCTL_TEST_REGISTER(ml_io_read_cpu_reg, ml_io_read_cpu_reg_test); #endif /* DEVELOPMENT || DEBUG */