/*
 * Copyright (c) 2000-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */

#ifndef I386_CPU_DATA
#define I386_CPU_DATA

#include <mach_assert.h>
#include <machine/atomic.h>

#include <kern/assert.h>
#include <kern/kern_types.h>
#include <kern/mpqueue.h>
#include <kern/queue.h>
#include <kern/processor.h>
#include <kern/pms.h>
#include <pexpert/pexpert.h>
#include <mach/i386/thread_status.h>
#include <mach/i386/vm_param.h>
#include <i386/locks.h>
#include <i386/rtclock_protos.h>
#include <i386/pmCPU.h>
#include <i386/cpu_topology.h>
#include <i386/seg.h>
#include <i386/mp.h>

#if CONFIG_VMX
#include <i386/vmx/vmx_cpu.h>
#endif

#if MONOTONIC
#include <machine/monotonic.h>
#endif /* MONOTONIC */

#include <machine/pal_routines.h>
#include <san/kcov_data.h>

/*
 * Data structures referenced (anonymously) from per-cpu data:
 */
struct cpu_cons_buffer;
struct cpu_desc_table;
struct mca_state;
struct prngContext;

/*
 * Data structures embedded in per-cpu data:
 */
typedef struct rtclock_timer {
	mpqueue_head_t  queue;
	uint64_t        deadline;
	uint64_t        when_set;
	boolean_t       has_expired;
} rtclock_timer_t;

typedef struct {
	/* The 'u'-suffixed fields store the double-mapped descriptor addresses */
	struct x86_64_tss       *cdi_ktssu;
	struct x86_64_tss       *cdi_ktssb;
	x86_64_desc_register_t  cdi_gdtu;
	x86_64_desc_register_t  cdi_gdtb;
	x86_64_desc_register_t  cdi_idtu;
	x86_64_desc_register_t  cdi_idtb;
	struct real_descriptor  *cdi_ldtu;
	struct real_descriptor  *cdi_ldtb;
	vm_offset_t             cdi_sstku;
	vm_offset_t             cdi_sstkb;
} cpu_desc_index_t;

typedef enum {
	TASK_MAP_32BIT,         /* 32-bit user, compatibility mode */
	TASK_MAP_64BIT,         /* 64-bit user thread, shared space */
} task_map_t;

/*
 * This structure is used on entry into the (uber-)kernel on syscall from
 * a 64-bit user. It contains the address of the machine state save area
 * for the current thread and a temporary place to save the user's rsp
 * before loading this address into rsp.
 */
typedef struct {
	addr64_t        cu_isf;         /* thread->pcb->iss.isf */
	uint64_t        cu_tmp;         /* temporary scratch */
	addr64_t        cu_user_gs_base;
} cpu_uber_t;
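/*
 * A minimal sketch of how these fields are used, assuming the syscall-entry
 * pattern described in the comment above (illustrative pseudo-assembly, not
 * the actual xnu entry stubs; the CPU_UBER_* names are hypothetical
 * assembler offsets for cu_tmp and cu_isf within the per-CPU area):
 *
 *     swapgs                          ; %gs now bases the per-CPU data
 *     mov  %rsp, %gs:CPU_UBER_TMP     ; stash the user rsp in cu_tmp
 *     mov  %gs:CPU_UBER_ISF, %rsp     ; point rsp at the thread's save area
 *                                     ; ... then build the interrupt state frame
 */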
typedef uint16_t        pcid_t;
typedef uint8_t         pcid_ref_t;

#define CPU_RTIME_BINS (12)
#define CPU_ITIME_BINS (CPU_RTIME_BINS)

#define MAX_TRACE_BTFRAMES (16)

typedef struct {
	boolean_t       pltype;
	int             plevel;
	uint64_t        plbt[MAX_TRACE_BTFRAMES];
} plrecord_t;

#if DEVELOPMENT || DEBUG
typedef struct {
	int             vector;                 /* Vector number of interrupt */
	thread_t        curthread;              /* Current thread at the time of the interrupt */
	uint64_t        interrupted_pc;
	int             curpl;                  /* Current preemption level */
	int             curil;                  /* Current interrupt level */
	uint64_t        start_time_abs;
	uint64_t        duration;
	uint64_t        backtrace[MAX_TRACE_BTFRAMES];
} traptrace_entry_t;

#define TRAPTRACE_INVALID_INDEX (~0U)
#define DEFAULT_TRAPTRACE_ENTRIES_PER_CPU (16)
#define TRAPTRACE_MAX_ENTRIES_PER_CPU (256)
extern volatile int traptrace_enabled;
extern uint32_t traptrace_entries_per_cpu;
PERCPU_DECL(uint32_t, traptrace_next);
PERCPU_DECL(traptrace_entry_t * __unsafe_indexable, traptrace_ring);
#endif /* DEVELOPMENT || DEBUG */

/*
 * Per-cpu data.
 *
 * Each processor has a per-cpu data area which is dereferenced through the
 * current_cpu_datap() macro. For speed, the %gs segment is based here, and
 * using this, inlines provide single-instruction access to frequently used
 * members - such as get_cpu_number()/cpu_number(), and get_active_thread()/
 * current_thread(). (See the illustrative snippet following pcid_cdata_t
 * below.)
 *
 * Cpu data owned by another processor can be accessed using the
 * cpu_datap(cpu_number) macro which uses the cpu_data_ptr[] array of per-cpu
 * pointers.
 */
typedef struct {
	pcid_t                  cpu_pcid_free_hint;
#define PMAP_PCID_MAX_PCID      (0x800)
	pcid_ref_t              cpu_pcid_refcounts[PMAP_PCID_MAX_PCID];
	pmap_t                  cpu_pcid_last_pmap_dispatched[PMAP_PCID_MAX_PCID];
} pcid_cdata_t;
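/*
 * A minimal illustrative sketch of the two access paths described above
 * (the CPU index '7' is hypothetical):
 *
 *     thread_t   self = current_thread();     // %gs-relative, current CPU only
 *     cpu_data_t *cdp = cpu_datap(7);         // any CPU, via cpu_data_ptr[]
 *     if (cdp != NULL && cdp->cpu_running) {
 *         // safe to inspect another CPU's data
 *     }
 */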
typedef struct cpu_data {
	struct pal_cpu_data     cpu_pal_data;           /* PAL-specific data */
#define cpu_pd cpu_pal_data                             /* convenience alias */
	struct cpu_data         *cpu_this;              /* pointer to myself */
	vm_offset_t             cpu_pcpu_base;
	thread_t                cpu_active_thread;
	thread_t                cpu_nthread;
	int                     cpu_number;             /* Logical CPU */
	void                    *cpu_int_state;         /* interrupt state */
	vm_offset_t             cpu_active_stack;       /* kernel stack base */
	vm_offset_t             cpu_kernel_stack;       /* kernel stack top */
	vm_offset_t             cpu_int_stack_top;
	volatile int            cpu_signals;            /* IPI events */
	volatile int            cpu_prior_signals;      /* Last set of events,
	                                                 * for debugging */
	ast_t                   cpu_pending_ast;
	/*
	 * Note if rearranging fields:
	 * We want cpu_preemption_level on a different
	 * cache line than cpu_active_thread
	 * for optimizing mtx_spin phase.
	 */
	int                     cpu_interrupt_level;
	volatile int            cpu_preemption_level;
	volatile int            cpu_running;
#if !MONOTONIC
	boolean_t               cpu_fixed_pmcs_enabled;
#endif /* !MONOTONIC */
	rtclock_timer_t         rtclock_timer;
	volatile addr64_t       cpu_active_cr3 __attribute((aligned(64)));
	union {
		volatile uint32_t cpu_tlb_invalid;
		struct {
			volatile uint16_t cpu_tlb_invalid_local;
			volatile uint16_t cpu_tlb_invalid_global;
		};
	};
	uint64_t                cpu_ip_desc[2];
	volatile task_map_t     cpu_task_map;
	volatile addr64_t       cpu_task_cr3;
	addr64_t                cpu_kernel_cr3;
	volatile addr64_t       cpu_ucr3;
	volatile addr64_t       cpu_shadowtask_cr3;
	boolean_t               cpu_pagezero_mapped;
	cpu_uber_t              cpu_uber;
	/* Double-mapped per-CPU exception stack address */
	uintptr_t               cd_estack;
	int                     cpu_xstate;
	int                     cpu_curtask_has_ldt;
	int                     cpu_curthread_do_segchk;
	/*
	 * Address of shadowed, partially mirrored CPU data structures located
	 * in the double-mapped PML4.
	 */
	void                    *cd_shadow;
	union {
		volatile uint32_t cpu_tlb_invalid_count;
		struct {
			volatile uint16_t cpu_tlb_invalid_local_count;
			volatile uint16_t cpu_tlb_invalid_global_count;
		};
	};

	uint16_t                cpu_tlb_gen_counts_local[MAX_CPUS];
	uint16_t                cpu_tlb_gen_counts_global[MAX_CPUS];

	struct processor        *cpu_processor;
	struct real_descriptor  *cpu_ldtp;
	struct cpu_desc_table   *cpu_desc_tablep;
	cpu_desc_index_t        cpu_desc_index;
	int                     cpu_ldt;
#define HWINTCNT_SIZE 256
	uint32_t                cpu_hwIntCnt[HWINTCNT_SIZE];    /* Interrupt counts */
	uint64_t                cpu_hwIntpexits[HWINTCNT_SIZE];
	uint64_t                cpu_dr7;                /* debug control register */
	uint64_t                cpu_int_event_time;     /* intr entry/exit time */
	pal_rtc_nanotime_t      *cpu_nanotime;          /* Nanotime info */
#if KPC
	/* double-buffered performance counter data */
	uint64_t                *cpu_kpc_buf[2];
	/* PMC shadow and reload value buffers */
	uint64_t                *cpu_kpc_shadow;
	uint64_t                *cpu_kpc_reload;
#endif
#if MONOTONIC
	struct mt_cpu           cpu_monotonic;
#endif /* MONOTONIC */
	uint32_t                cpu_pmap_pcid_enabled;
	pcid_t                  cpu_active_pcid;
	pcid_t                  cpu_last_pcid;
	pcid_t                  cpu_kernel_pcid;
	volatile pcid_ref_t     *cpu_pmap_pcid_coherentp;
	volatile pcid_ref_t     *cpu_pmap_pcid_coherentp_kernel;
	pcid_cdata_t            *cpu_pcid_data;
#ifdef PCID_STATS
	uint64_t                cpu_pmap_pcid_flushes;
	uint64_t                cpu_pmap_pcid_preserves;
#endif
	uint64_t                cpu_aperf;
	uint64_t                cpu_mperf;
	uint64_t                cpu_c3res;
	uint64_t                cpu_c6res;
	uint64_t                cpu_c7res;
	uint64_t                cpu_itime_total;
	uint64_t                cpu_rtime_total;
	uint64_t                cpu_ixtime;
	uint64_t                cpu_idle_exits;
	/*
	 * Note that the cacheline-copy mechanism uses the cpu_rtimes field in the shadow CPU
	 * structures to temporarily stash the code cacheline that includes the instruction
	 * pointer at the time of the fault (this field is otherwise unused in the shadow
	 * CPU structures).
	 */
	uint64_t                cpu_rtimes[CPU_RTIME_BINS];
	uint64_t                cpu_itimes[CPU_ITIME_BINS];
#if !MONOTONIC
	uint64_t                cpu_cur_insns;
	uint64_t                cpu_cur_ucc;
	uint64_t                cpu_cur_urc;
#endif /* !MONOTONIC */
	uint64_t                cpu_gpmcs[4];
	uint64_t                cpu_max_observed_int_latency;
	int                     cpu_max_observed_int_latency_vector;
	volatile boolean_t      cpu_NMI_acknowledged;
	uint64_t                debugger_entry_time;
	uint64_t                debugger_ipi_time;
	/*
	 * A separate nested interrupt stack flag, to account
	 * for non-nested interrupts arriving while on the interrupt stack.
	 * Currently only occurs when AICPM enables interrupts on the
	 * interrupt stack during processor offlining.
	 */
	uint32_t                cpu_nested_istack;
	uint32_t                cpu_nested_istack_events;
	x86_saved_state64_t     *cpu_fatal_trap_state;
	x86_saved_state64_t     *cpu_post_fatal_trap_state;
#if CONFIG_VMX
	vmx_cpu_t               cpu_vmx;                /* wonderful world of virtualization */
#endif
#if CONFIG_MCA
	struct mca_state        *cpu_mca_state;         /* State at MC fault */
#endif
	int                     cpu_type;
	int                     cpu_subtype;
	int                     cpu_threadtype;
	boolean_t               cpu_iflag;
	boolean_t               cpu_boot_complete;
	int                     cpu_hibernate;
#define MAX_PREEMPTION_RECORDS (8)
#if DEVELOPMENT || DEBUG
	int                     cpu_plri;
	plrecord_t              plrecords[MAX_PREEMPTION_RECORDS];
#endif
	struct x86_lcpu         lcpu;
	int                     cpu_phys_number;        /* Physical CPU */
	cpu_id_t                cpu_id;                 /* Platform Expert */
#if DEBUG
	uint64_t                cpu_entry_cr3;
	uint64_t                cpu_exit_cr3;
	uint64_t                cpu_pcid_last_cr3;
#endif
	boolean_t               cpu_rendezvous_in_progress;
#if CST_DEMOTION_DEBUG
	/* Count of thread wakeups issued by this processor */
	uint64_t                cpu_wakeups_issued_total;
#endif
#if DEBUG || DEVELOPMENT
	uint64_t                tsc_sync_delta;
#endif
	uint32_t                cpu_soft_apic_lvt_timer;
#if CONFIG_KCOV
	kcov_cpu_data_t         cpu_kcov_data;
#endif
} cpu_data_t;

extern cpu_data_t *__single cpu_data_ptr[MAX_CPUS];
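/*
 * A minimal sketch of walking cpu_data_ptr[] directly, e.g. to total a
 * per-CPU counter for a hypothetical diagnostic ('vector' is a
 * caller-supplied interrupt vector, assumed < HWINTCNT_SIZE):
 *
 *     uint64_t total = 0;
 *     for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
 *         if (cpu_data_ptr[cpu] != NULL) {
 *             total += cpu_data_ptr[cpu]->cpu_hwIntCnt[vector];
 *         }
 *     }
 */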
/*
 * __SEG_GS marks %gs-relative operations:
 *   https://clang.llvm.org/docs/LanguageExtensions.html#memory-references-to-specified-segments
 *   https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces
 */
#if defined(__SEG_GS)
// __seg_gs exists
#elif defined(__clang__)
#define __seg_gs __attribute__((address_space(256)))
#else
#error use a compiler that supports address spaces or __seg_gs
#endif

#define CPU_DATA()              ((cpu_data_t __seg_gs *)0UL)
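/*
 * A minimal sketch of what the %gs-based access buys, assuming a
 * __seg_gs-aware compiler: a field read such as
 *
 *     int n = CPU_DATA()->cpu_number;
 *
 * needs no pointer load and is expected to lower to a single instruction
 * along the lines of
 *
 *     movl %gs:<offsetof(cpu_data_t, cpu_number)>, %eax
 *
 * which is what makes the inlines below so cheap.
 */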
/*
 * Everyone within the osfmk part of the kernel can use the fast
 * inline versions of these routines. Everyone outside must call
 * the real thing.
 */

/*
 * The "volatile" flavor of current_thread() is intended for use by
 * scheduler code which may need to update the thread pointer in the
 * course of a context switch. Any call to current_thread() made
 * prior to the thread pointer update should be safe to optimize away
 * as it should be consistent with that thread's state to the extent
 * the compiler can reason about it. Likewise, the context switch
 * path will eventually result in an arbitrary branch to the new
 * thread's pc, about which the compiler won't be able to reason.
 * Thus any compile-time optimization of current_thread() calls made
 * within the new thread should be safely encapsulated in its
 * register/stack state. The volatile form therefore exists to cover
 * the window between the thread pointer update and the branch to
 * the new pc.
 */
static inline thread_t
get_active_thread_volatile(void)
{
	return CPU_DATA()->cpu_active_thread;
}

static inline __attribute__((const)) thread_t
get_active_thread(void)
{
	return CPU_DATA()->cpu_active_thread;
}

#define current_thread_fast()           get_active_thread()
#define current_thread_volatile()       get_active_thread_volatile()

#define cpu_mode_is64bit()              TRUE

static inline int
get_preemption_level(void)
{
	return CPU_DATA()->cpu_preemption_level;
}
static inline int
get_interrupt_level(void)
{
	return CPU_DATA()->cpu_interrupt_level;
}
static inline int
get_cpu_number(void)
{
	return CPU_DATA()->cpu_number;
}
static inline vm_offset_t
get_current_percpu_base(void)
{
	return CPU_DATA()->cpu_pcpu_base;
}
static inline int
get_cpu_phys_number(void)
{
	return CPU_DATA()->cpu_phys_number;
}

static inline cpu_data_t *
current_cpu_datap(void)
{
	return CPU_DATA()->cpu_this;
}
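/*
 * A minimal sketch of why both flavors exist, assuming hypothetical
 * context-switch code (illustrative only):
 *
 *     thread_t old = current_thread_volatile();   // re-reads cpu_active_thread
 *     CPU_DATA()->cpu_active_thread = new_thread; // the window opens here
 *     thread_t cur = current_thread_volatile();   // not folded into 'old'
 *
 * Because get_active_thread() is declared __attribute__((const)), the
 * compiler is free to collapse repeated current_thread_fast() calls into a
 * single load, which would be unsafe across the assignment above.
 */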
/*
 * Facility to diagnose preemption-level imbalances, which are otherwise
 * challenging to debug. On each operation that enables or disables preemption,
 * we record a backtrace into a per-CPU ring buffer, along with the current
 * preemption level and operation type. Thus, if an imbalance is observed,
 * one can examine these per-CPU records to determine which codepath failed
 * to re-enable preemption, enabled preemption without a corresponding
 * disablement, etc. The backtracer determines which stack is currently active,
 * and uses that to perform bounds checks on unterminated stacks.
 * To enable, sysctl -w machdep.pltrace=1 on DEVELOPMENT or DEBUG kernels (DRK '15).
 * The bounds check currently doesn't account for non-default thread stack sizes.
 */
#if DEVELOPMENT || DEBUG
static inline void
rbtrace_bt(uint64_t *__counted_by(maxframes)rets, int maxframes, cpu_data_t *cdata,
    uint64_t frameptr, bool use_cursp)
{
	extern uint32_t low_intstack[];         /* bottom */
	extern uint32_t low_eintstack[];        /* top */
	extern char     mp_slave_stack[];
	int             btidx = 0;

	uint64_t kstackb, kstackt;

	/*
	 * Obtain the 'current' program counter, initial backtrace
	 * element. This will also indicate if we were unable to
	 * trace further up the stack for some reason.
	 */
	if (use_cursp) {
		__asm__ volatile ("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:"
                    : "=m" (rets[btidx++])
                    :
                    : "rax");
	}

	thread_t __single cplthread = cdata->cpu_active_thread;
	if (cplthread) {
		uintptr_t csp;
		if (use_cursp) {
			__asm__ __volatile__ ("movq %%rsp, %0": "=r" (csp):);
		} else {
			csp = frameptr;
		}
		/*
		 * Determine which stack we're on to populate stack bounds.
		 * We don't need to trace across stack boundaries for this
		 * routine.
		 */
		kstackb = cdata->cpu_active_stack;
		kstackt = kstackb + KERNEL_STACK_SIZE;
		if (csp < kstackb || csp > kstackt) {
			kstackt = cdata->cpu_kernel_stack;
			kstackb = kstackt - KERNEL_STACK_SIZE;
			if (csp < kstackb || csp > kstackt) {
				kstackt = cdata->cpu_int_stack_top;
				kstackb = kstackt - INTSTACK_SIZE;
				if (csp < kstackb || csp > kstackt) {
					kstackt = (uintptr_t)&low_eintstack;
					kstackb = kstackt - INTSTACK_SIZE;
					if (csp < kstackb || csp > kstackt) {
						kstackb = (uintptr_t)&mp_slave_stack;
						kstackt = kstackb + PAGE_SIZE;
					} else {
						kstackb = 0;
						kstackt = 0;
					}
				}
			}
		}

		if (__probable(kstackb && kstackt)) {
			uint64_t *cfp = __unsafe_forge_single(uint64_t *, frameptr);
			int rbbtf;

			for (rbbtf = btidx; rbbtf < maxframes; rbbtf++) {
				uint64_t cur_retp;
				/*
				 * cfp == 0 is covered by the first comparison, and we're guaranteed
				 * that kstackb is non-zero from the containing if block. The os_add_overflow is
				 * necessary because it's not uncommon for backtraces to terminate with bogus
				 * frame pointers.
				 */
				if (((uint64_t)cfp < kstackb) ||
				    os_add_overflow((uint64_t)cfp, sizeof(uint64_t), &cur_retp) ||
				    cur_retp >= kstackt) {
					rets[rbbtf] = 0;
					continue;
				}
				rets[rbbtf] = *(uint64_t *)cur_retp;
				cfp = __unsafe_forge_single(uint64_t *, *cfp);
			}
		}
	}
}

__attribute__((noinline))
static inline void
pltrace_internal(boolean_t enable)
{
	cpu_data_t *cdata = current_cpu_datap();
	int cpli = cdata->cpu_preemption_level;
	int cplrecord = cdata->cpu_plri;
	uint64_t *plbts;

	assert(cpli >= 0);

	cdata->plrecords[cplrecord].pltype = enable;
	cdata->plrecords[cplrecord].plevel = cpli;

	plbts = &cdata->plrecords[cplrecord].plbt[0];

	cplrecord++;

	if (cplrecord >= MAX_PREEMPTION_RECORDS) {
		cplrecord = 0;
	}

	cdata->cpu_plri = cplrecord;

	rbtrace_bt(plbts, MAX_TRACE_BTFRAMES - 1, cdata,
	    (uint64_t)__builtin_frame_address(0), false);
}

extern int plctrace_enabled;

static inline uint32_t
traptrace_start(int vecnum, uint64_t ipc, uint64_t sabs, uint64_t frameptr)
{
	cpu_data_t *cdata;
	uint32_t nextidx;
	traptrace_entry_t *cur_traptrace_ring;
	uint32_t *nextidxp;

	if (__improbable(traptrace_enabled == 0 ||
	    traptrace_entries_per_cpu == 0)) {
		return TRAPTRACE_INVALID_INDEX;
	}

	assert(ml_get_interrupts_enabled() == FALSE);
	cdata = current_cpu_datap();
	nextidxp = PERCPU_GET(traptrace_next);
	nextidx = *nextidxp;
	/* prevent nested interrupts from clobbering this record */
	*nextidxp = (((nextidx + 1) >= (unsigned int)traptrace_entries_per_cpu) ? 0 : (nextidx + 1));

	cur_traptrace_ring = __unsafe_forge_bidi_indexable(traptrace_entry_t *,
	    *PERCPU_GET(traptrace_ring),
	    sizeof(traptrace_entry_t) * traptrace_entries_per_cpu);

	cur_traptrace_ring[nextidx].vector = vecnum;
	cur_traptrace_ring[nextidx].curthread = current_thread_fast();
	cur_traptrace_ring[nextidx].interrupted_pc = ipc;
	cur_traptrace_ring[nextidx].curpl = cdata->cpu_preemption_level;
	cur_traptrace_ring[nextidx].curil = cdata->cpu_interrupt_level;
	cur_traptrace_ring[nextidx].start_time_abs = sabs;
	cur_traptrace_ring[nextidx].duration = ~0ULL;

	rbtrace_bt(&cur_traptrace_ring[nextidx].backtrace[0],
	    MAX_TRACE_BTFRAMES - 1, cdata, frameptr, false);

	assert(nextidx <= 0xFFFF);
	/*
	 * Encode the cpu number we're on because traptrace_end()
	 * might be called from a different CPU.
	 */
	return ((uint32_t)cdata->cpu_number << 16) | nextidx;
}

static inline void
traptrace_end(uint32_t index, uint64_t eabs)
{
	traptrace_entry_t *__unsafe_indexable ring;

	if (index != TRAPTRACE_INVALID_INDEX) {
		ring = *PERCPU_GET_WITH_BASE(other_percpu_base(index >> 16),
		    traptrace_ring);
		index &= 0xFFFF;
		ring[index].duration = eabs - ring[index].start_time_abs;
	}
}

#endif /* DEVELOPMENT || DEBUG */
__header_always_inline void
pltrace(boolean_t plenable)
{
#if DEVELOPMENT || DEBUG
	if (__improbable(plctrace_enabled != 0)) {
		pltrace_internal(plenable);
	}
#else
	(void)plenable;
#endif
}

static inline void
disable_preemption_internal(void)
{
	assert(get_preemption_level() >= 0);
	os_compiler_barrier();
	CPU_DATA()->cpu_preemption_level++;
	os_compiler_barrier();
	pltrace(FALSE);
}

static inline void
enable_preemption_internal(void)
{
	assert(get_preemption_level() > 0);
	pltrace(TRUE);
	os_compiler_barrier();
	if (0 == --CPU_DATA()->cpu_preemption_level) {
		kernel_preempt_check();
	}
	os_compiler_barrier();
}

static inline void
enable_preemption_no_check(void)
{
	assert(get_preemption_level() > 0);
	pltrace(TRUE);
	os_compiler_barrier();
	CPU_DATA()->cpu_preemption_level--;
	os_compiler_barrier();
}

static inline void
_enable_preemption_no_check(void)
{
	enable_preemption_no_check();
}

static inline void
mp_disable_preemption(void)
{
	disable_preemption_internal();
}

static inline void
_mp_disable_preemption(void)
{
	disable_preemption_internal();
}

static inline void
mp_enable_preemption(void)
{
	enable_preemption_internal();
}

static inline void
_mp_enable_preemption(void)
{
	enable_preemption_internal();
}

static inline void
mp_enable_preemption_no_check(void)
{
	enable_preemption_no_check();
}

static inline void
_mp_enable_preemption_no_check(void)
{
	enable_preemption_no_check();
}

#ifdef XNU_KERNEL_PRIVATE
#define disable_preemption()                            disable_preemption_internal()
#define disable_preemption_without_measurements()       disable_preemption_internal()
#define enable_preemption()                             enable_preemption_internal()
#define MACHINE_PREEMPTION_MACROS (1)
#endif

static inline cpu_data_t *
cpu_datap(int cpu)
{
	return cpu_data_ptr[cpu];
}

static inline int
cpu_is_running(int cpu)
{
	return (cpu_datap(cpu) != NULL) && (cpu_datap(cpu)->cpu_running);
}

#ifdef MACH_KERNEL_PRIVATE
static inline cpu_data_t *
cpu_shadowp(int cpu)
{
	return cpu_data_ptr[cpu]->cd_shadow;
}
#endif

extern cpu_data_t *cpu_data_alloc(boolean_t is_boot_cpu);
extern void cpu_data_realloc(void);

#endif /* I386_CPU_DATA */