1 /*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_page_internal.h>
56 #include <vm/vm_pageout_xnu.h>
57 #include <vm/vm_shared_region_xnu.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern_xnu.h>
60 #include <sys/codesign.h>
61 #include <sys/kdebug.h>
62 #include <kern/coalition.h>
63 #include <pexpert/device_tree.h>
64 #include <pexpert/arm64/board_config.h>
65
66 #include <IOKit/IOPlatformExpert.h>
67 #if HIBERNATION
68 #include <IOKit/IOHibernatePrivate.h>
69 #endif /* HIBERNATION */
70
71 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
72 #include <arm64/amcc_rorgn.h>
73 #endif
74
75
76 #if CONFIG_SPTM
77 #include <arm64/sptm/sptm.h>
78 #endif /* CONFIG_SPTM */
79
80 #include <libkern/OSAtomic.h>
81 #include <libkern/section_keywords.h>
82
83 /**
84 * On supported hardware, debuggable builds make the HID bits read-only
85 * without locking them. This lets people manually modify HID bits while
86 * debugging, since they can use a debugging tool to first reset the HID
87 * bits back to read/write. However it will still catch xnu changes that
88 * accidentally write to HID bits after they've been made read-only.
89 */
90 SECURITY_READ_ONLY_LATE(bool) skip_spr_lockdown_glb = 0;
91
92 /*
93 * On some SoCs, PIO lockdown is applied in assembly in early boot by
94 * secondary CPUs.
95 * Since the cluster_pio_ro_ctl value is dynamic, it is stored here by the
96 * primary CPU so that it doesn't have to be computed each time by the
97 * startup code.
98 */
99 SECURITY_READ_ONLY_LATE(uint64_t) cluster_pio_ro_ctl_mask_glb = 0;
100
101 #if CONFIG_CPU_COUNTERS
102 #include <kern/kpc.h>
103 #endif /* CONFIG_CPU_COUNTERS */
104
/* Extract the AFF0 field of an MPIDR_EL1 value; this file uses it as the CPU ID. */
#define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
/* Extract the AFF1 field of an MPIDR_EL1 value; this file uses it as the cluster ID. */
#define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
107
108 #if HAS_CLUSTER
109 static uint8_t cluster_initialized = 0;
110 #endif
111
/* "lock" timeout; default is 6e6 timebase units (0.25s per the inline note). */
MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout

/* "ticket-lock" timeout; default is 3e6 timebase units (0.125s per the inline note). */
MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);

/* "mutex-spin" tunable; default is 240 (10us per the inline note). */
TUNABLE_DEV_WRITEABLE(uint64_t, MutexSpin, "mutex-spin", 240 /* 10us */);

/* Both are assigned from MutexSpin in ml_init_lock_timeout(). */
uint64_t low_MutexSpin;
int64_t high_MutexSpin;
121
122
123
124 static uint64_t ml_wfe_hint_max_interval;
125 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
126
127 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
128 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
129
130 extern vm_offset_t segLOWEST;
131 extern vm_offset_t segLOWESTTEXT;
132 extern vm_offset_t segLASTB;
133 extern unsigned long segSizeLAST;
134
135 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
136 extern vm_offset_t vm_kernelcache_base;
137 extern vm_offset_t vm_kernelcache_top;
138
139 /* Location of the physmap / physical aperture */
140 extern uint64_t physmap_base;
141
142 #if defined(CONFIG_SPTM)
143 extern const arm_physrange_t *arm_vm_kernelcache_ranges;
144 extern int arm_vm_kernelcache_numranges;
145 #else /* defined(CONFIG_SPTM) */
146 extern vm_offset_t arm_vm_kernelcache_phys_start;
147 extern vm_offset_t arm_vm_kernelcache_phys_end;
148 #endif /* defined(CONFIG_SPTM) */
149
150 #if defined(HAS_IPI)
151 unsigned int gFastIPI = 1;
152 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
153 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
154 kDeferredIPITimerDefault);
155 #endif /* defined(HAS_IPI) */
156
157 thread_t Idle_context(void);
158
159 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
160
161 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
162 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
163 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
164 .version = CPU_TOPOLOGY_VERSION,
165 .cpus = topology_cpu_array,
166 .clusters = topology_cluster_array,
167 };
168
169 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
170
171 /**
172 * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
173 * entries of an arbitrary data type. This is intended for use by specialized consumers
174 * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
175 * as follows:
176 * hypothetical_array[cluster_offsets[AFF1] + AFF0]
177 * Most consumers should instead use general-purpose facilities such as PERCPU or
178 * ml_get_cpu_number().
179 */
180 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
181
182 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
183
184 extern uint32_t lockdown_done;
185
186 /**
187 * Represents regions of virtual address space that should be reserved
188 * (pre-mapped) in each user address space.
189 */
190 static const struct vm_reserved_region vm_reserved_regions[] = {
191 {
192 .vmrr_name = "GPU Carveout",
193 .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
194 .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
195 },
196 /*
197 * Reserve the virtual memory space representing the commpage nesting region
198 * to prevent user processes from allocating memory within it. The actual
199 * page table entries for the commpage are inserted by vm_commpage_enter().
200 * This vm_map_enter() just prevents userspace from allocating/deallocating
201 * anything within the entire commpage nested region.
202 */
203 {
204 .vmrr_name = "commpage nesting",
205 .vmrr_addr = _COMM_PAGE64_NESTING_START,
206 .vmrr_size = _COMM_PAGE64_NESTING_SIZE
207 }
208 };
209
210 uint32_t get_arm_cpu_version(void);
211
212 #if HAS_MTE
213 static uint64_t arm_mte_random_rgsr_el1_seed(void);
214 #endif
215
#if defined(HAS_IPI)
/**
 * Issue a fast-IPI request of the given type to the CPU named by cpu_mpidr.
 *
 * The request value is `type` OR'd with the target's AFF0 (CPU) field. With
 * HAS_CLUSTER, a target in the caller's own cluster is signalled through
 * S3_5_C15_C0_0; any other target is signalled through S3_5_C15_C0_1 with the
 * target's AFF1 (cluster) field added at bit 16. Without HAS_CLUSTER the
 * request always goes to S3_5_C15_C0_1.
 *
 * @note This expects a non-preemptible caller, or at least one bound to a
 *       single CPU; otherwise the thread may migrate between choosing the
 *       IPI mechanism and issuing the IPI.
 *
 * @param cpu_mpidr MPIDR value identifying the target CPU.
 * @param type      Request-type bits OR'd into the register value.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
#if HAS_CLUSTER
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
	uint64_t self_mpidr;
	uint64_t request;

	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Target lives in another cluster: include its cluster ID. */
		request = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", request);
	} else {
		/* Target shares our cluster: only the CPU ID is needed. */
		request = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", request);
	}
#else
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", request);
#endif
	/*
	 * The recommended local/global IPI sequence is:
	 *   DSB <sys>  (ensures visibility of e.g. older stores to the pending
	 *              CPU signals bit vector in DRAM prior to IPI reception;
	 *              it is present in cpu_signal_internal())
	 *   MSR S3_5_C15_C0_1, Xt
	 *   ISB
	 */
	__builtin_arm_isb(ISB_SY);
}
#endif /* defined(HAS_IPI) */
249
250 #if !defined(HAS_IPI)
251 __dead2
252 #endif
253 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)254 ml_cpu_signal(unsigned int cpu_mpidr __unused)
255 {
256 #if defined(HAS_IPI)
257 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
258 #else
259 panic("Platform does not support ACC Fast IPI");
260 #endif
261 }
262
263 #if !defined(HAS_IPI)
264 __dead2
265 #endif
266 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)267 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
268 {
269 #if defined(HAS_IPI)
270 /* adjust IPI_CR timer countdown value for deferred IPI
271 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
272 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
273 *
274 * global register, should only require a single write to update all
275 * CPU cores: from Skye ACC user spec section 5.7.3.3
276 *
277 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
278 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
279 */
280 uint64_t abstime;
281
282 nanoseconds_to_absolutetime(nanosecs, &abstime);
283
284 abstime = MIN(abstime, 0xFFFF);
285
286 /* update deferred_ipi_timer_ns with the new clamped value */
287 absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
288
289 MSR("S3_5_C15_C3_1", abstime);
290 #else
291 (void)nanosecs;
292 panic("Platform does not support ACC Fast IPI");
293 #endif
294 }
295
/**
 * Return the current deferred-IPI timer value in nanoseconds.
 *
 * @return deferred_ipi_timer_ns on HAS_IPI builds; 0 otherwise.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
305
306 #if !defined(HAS_IPI)
307 __dead2
308 #endif
309 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)310 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
311 {
312 #if defined(HAS_IPI)
313 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
314 #else
315 panic("Platform does not support ACC Fast IPI deferral");
316 #endif
317 }
318
319 #if !defined(HAS_IPI)
320 __dead2
321 #endif
322 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)323 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
324 {
325 #if defined(HAS_IPI)
326 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
327 #else
328 panic("Platform does not support ACC Fast IPI retraction");
329 #endif
330 }
331
332 extern uint32_t idle_proximate_io_wfe_unmasked;
333
334 #define CPUPM_IDLE_WFE 0x5310300
335 static bool
wfe_process_recommendation(void)336 wfe_process_recommendation(void)
337 {
338 bool ipending = false;
339 if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
340 /* Check for an active perf. controller generated
341 * WFE recommendation for this cluster.
342 */
343 cpu_data_t *cdp = getCpuDatap();
344 uint32_t cid = cdp->cpu_cluster_id;
345 uint64_t wfe_ttd = 0;
346 uint64_t wfe_deadline = 0;
347
348 if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
349 wfe_deadline = mach_absolute_time() + wfe_ttd;
350 }
351
352 if (wfe_deadline != 0) {
353 /* Poll issuing event-bounded WFEs until an interrupt
354 * arrives or the WFE recommendation expires
355 */
356 #if DEVELOPMENT || DEBUG
357 uint64_t wc = cdp->wfe_count;
358 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
359 #endif
360 /* Issue WFE until the recommendation expires,
361 * with IRQs unmasked.
362 */
363 ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true, true);
364 #if DEVELOPMENT || DEBUG
365 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
366 #endif
367 }
368 }
369 return ipending;
370 }
371
372 void
machine_idle(void)373 machine_idle(void)
374 {
375 /* Interrupts are expected to be masked on entry or re-entry via
376 * Idle_load_context()
377 */
378 assert((__builtin_arm_rsr("DAIF") & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE);
379 /* Check for, and act on, a WFE recommendation.
380 * Bypasses context spill/fill for a minor perf. increment.
381 * May unmask and restore IRQ+FIQ mask.
382 */
383 if (wfe_process_recommendation() == false) {
384 /* If WFE recommendation absent, or WFE deadline
385 * arrived with no interrupt pending/processed,
386 * fall back to WFI.
387 */
388 Idle_context();
389 }
390 __builtin_arm_wsr("DAIFClr", DAIFSC_STANDARD_DISABLE);
391 }
392
393 void
OSSynchronizeIO(void)394 OSSynchronizeIO(void)
395 {
396 __builtin_arm_dsb(DSB_SY);
397 }
398
399 uint64_t
get_aux_control(void)400 get_aux_control(void)
401 {
402 uint64_t value;
403
404 MRS(value, "ACTLR_EL1");
405 return value;
406 }
407
408 uint64_t
get_mmu_control(void)409 get_mmu_control(void)
410 {
411 uint64_t value;
412
413 MRS(value, "SCTLR_EL1");
414 return value;
415 }
416
417 uint64_t
get_tcr(void)418 get_tcr(void)
419 {
420 uint64_t value;
421
422 MRS(value, "TCR_EL1");
423 return value;
424 }
425
426 __mockable boolean_t
ml_get_interrupts_enabled(void)427 ml_get_interrupts_enabled(void)
428 {
429 uint64_t value;
430
431 MRS(value, "DAIF");
432 if ((value & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE) {
433 return FALSE;
434 }
435 return TRUE;
436 }
437
438 pmap_paddr_t
get_mmu_ttb(void)439 get_mmu_ttb(void)
440 {
441 pmap_paddr_t value;
442
443 MRS(value, "TTBR0_EL1");
444 return value;
445 }
446
447 MARK_AS_FIXUP_TEXT uint32_t
get_arm_cpu_version(void)448 get_arm_cpu_version(void)
449 {
450 uint32_t value = machine_read_midr();
451
452 /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
453 return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
454 }
455
456 bool
ml_feature_supported(uint64_t feature_bit)457 ml_feature_supported(uint64_t feature_bit)
458 {
459 uint64_t aidr_el1_value = 0;
460
461 MRS(aidr_el1_value, "AIDR_EL1");
462
463 #ifdef APPLEAVALANCHE
464 #endif // APPLEAVALANCHE
465
466 return aidr_el1_value & feature_bit;
467 }
468
469 /*
470 * user_cont_hwclock_allowed()
471 *
472 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
473 * as a continuous time source (e.g. from mach_continuous_time)
474 */
475 boolean_t
user_cont_hwclock_allowed(void)476 user_cont_hwclock_allowed(void)
477 {
478 #if HAS_CONTINUOUS_HWCLOCK
479 return TRUE;
480 #else
481 return FALSE;
482 #endif
483 }
484
485 /*
486 * user_timebase_type()
487 *
488 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
489 *
490 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
491 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
492 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
493 *
494 */
495
496 uint8_t
user_timebase_type(void)497 user_timebase_type(void)
498 {
499 #if HAS_ACNTVCT
500 return USER_TIMEBASE_NOSPEC_APPLE;
501 #elif HAS_APPLE_GENERIC_TIMER
502 // Conveniently, S3_4_C15_C10_6 and ACNTVCT_EL0 have identical encodings
503 return USER_TIMEBASE_NOSPEC_APPLE;
504 #elif __ARM_ARCH_8_6__
505 return USER_TIMEBASE_NOSPEC;
506 #else
507 return USER_TIMEBASE_SPEC;
508 #endif
509 }
510
511 void
machine_startup(__unused boot_args * args)512 machine_startup(__unused boot_args * args)
513 {
514 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
515 if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
516 gFastIPI = 1;
517 }
518 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
519
520
521 machine_conf();
522
523
524 /*
525 * Kick off the kernel bootstrap.
526 */
527 kernel_bootstrap();
528 /* NOTREACHED */
529 }
530
531 typedef void (*invalidate_fn_t)(void);
532
533 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
534
535 void set_invalidate_hmac_function(invalidate_fn_t fn);
536
537 void
set_invalidate_hmac_function(invalidate_fn_t fn)538 set_invalidate_hmac_function(invalidate_fn_t fn)
539 {
540 if (NULL != invalidate_hmac_function) {
541 panic("Invalidate HMAC function already set");
542 }
543
544 invalidate_hmac_function = fn;
545 }
546
/**
 * Report whether secure hibernation is supported.
 *
 * @return Always false in this implementation.
 */
bool
ml_is_secure_hib_supported(void)
{
	return false;
}
552
553 static void ml_release_deferred_pages(void);
554
555 void
machine_lockdown(void)556 machine_lockdown(void)
557 {
558
559 #if CONFIG_SPTM
560
561 /**
562 * On devices that make use of the SPTM, the SPTM is responsible for
563 * managing system register locks. Due to this, we skip the call to
564 * spr_lockdown() below.
565 */
566 #else
567 #endif
568
569 arm_vm_prot_finalize(PE_state.bootArgs);
570 ml_release_deferred_pages();
571
572 #if CONFIG_KERNEL_INTEGRITY
573 #if KERNEL_INTEGRITY_WT
574 /* Watchtower
575 *
576 * Notify the monitor about the completion of early kernel bootstrap.
577 * From this point forward it will enforce the integrity of kernel text,
578 * rodata and page tables.
579 */
580
581 #ifdef MONITOR
582 monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
583 #endif
584 #endif /* KERNEL_INTEGRITY_WT */
585
586 #if CONFIG_SPTM
587 extern void pmap_prepare_commpages(void);
588 pmap_prepare_commpages();
589
590 /**
591 * sptm_lockdown_xnu() disables preemption like all SPTM calls, but may take
592 * a fair amount of time as it involves retyping a large number of pages.
593 * This preemption latency is not really a concern since we're still fairly
594 * early in the boot process, so just explicitly disable preemption before
595 * invoking the SPTM and abandon preemption latency measurements before
596 * re-enabling it.
597 */
598 disable_preemption();
599 /* Signal the SPTM that XNU is ready for RO memory to actually become read-only */
600 sptm_lockdown_xnu();
601 #if SCHED_HYGIENE_DEBUG
602 abandon_preemption_disable_measurement();
603 #endif /* SCHED_HYGIENE_DEBUG */
604 enable_preemption();
605 #else
606 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
607 /* KTRR
608 *
609 * Lock physical KTRR region. KTRR region is read-only. Memory outside
610 * the region is not executable at EL1.
611 */
612
613 rorgn_lockdown();
614 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
615 #endif /* CONFIG_SPTM */
616
617 #if XNU_MONITOR
618 pmap_lockdown_ppl();
619 #endif
620
621 #endif /* CONFIG_KERNEL_INTEGRITY */
622
623
624 /**
625 * For platforms that use SEP-backed hibernation, invoke kext-provided
626 * functionality to invalidate HMAC key in SIO used to sign a variety of
627 * data (e.g., the RO region).
628 *
629 * Just for paranoia's sake, let's make it so that if an attacker is
630 * capable of corrupting EDT early that they have to do so in a way that
631 * prevents invaldidate_hmac_function from running properly yet still
632 * makes it so that the invalidate HMAC function receives an OK
633 * response, which seems hard.
634 *
635 * This only makes sense for PPL-based systems seeing as SPTM-based systems
636 * will have iBoot invalidate Key1 for us.
637 */
638 if (NULL != invalidate_hmac_function) {
639 #if !defined(CONFIG_SPTM)
640 invalidate_hmac_function();
641 #endif /* !defined(CONFIG_SPTM) */
642 }
643
644 lockdown_done = 1;
645 }
646
647
648 char *
machine_boot_info(__unused char * buf,__unused vm_size_t size)649 machine_boot_info(
650 __unused char *buf,
651 __unused vm_size_t size)
652 {
653 return PE_boot_args();
654 }
655
656 void
machine_cpu_reinit(__unused void * param)657 machine_cpu_reinit(__unused void *param)
658 {
659 cpu_machine_init(); /* Initialize the processor */
660 clock_init(); /* Init the clock */
661 }
662
663 /*
664 * Routine: machine_processor_shutdown
665 * Function:
666 */
667 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)668 machine_processor_shutdown(
669 __unused thread_t thread,
670 void (*doshutdown)(processor_t),
671 processor_t processor)
672 {
673 return Shutdown_context(doshutdown, processor);
674 }
675
676 /*
677 * Routine: ml_init_lock_timeout
678 * Function:
679 */
680 static void __startup_func
ml_init_lock_timeout(void)681 ml_init_lock_timeout(void)
682 {
683 /*
684 * This function is called after STARTUP_SUB_TIMEOUTS
685 * initialization, so using the "legacy" boot-args here overrides
686 * the ml-timeout-... configuration. (Given that these boot-args
687 * here are usually explicitly specified, this makes sense by
688 * overriding ml-timeout-..., which may come from the device tree.
689 */
690
691 uint64_t lto_timeout_ns;
692 uint64_t lto_abstime;
693 uint32_t slto;
694
695 if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
696 lto_timeout_ns = slto * NSEC_PER_USEC;
697 nanoseconds_to_absolutetime(lto_timeout_ns, <o_abstime);
698 os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
699 } else {
700 lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
701 absolutetime_to_nanoseconds(lto_abstime, <o_timeout_ns);
702 }
703
704 os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
705
706 if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
707 nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, <o_abstime);
708 os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
709 } else if (lto_abstime != 0) {
710 os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
711 } // else take default from MACHINE_TIMEOUT.
712
713 uint64_t mtxspin;
714 uint64_t mtx_abstime;
715 if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
716 if (mtxspin > USEC_PER_SEC >> 4) {
717 mtxspin = USEC_PER_SEC >> 4;
718 }
719 nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
720 os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
721 } else {
722 mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
723 }
724
725 low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
726 /*
727 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
728 * real_ncpus is not set at this time
729 *
730 * NOTE: active spinning is disabled in arm. It can be activated
731 * by setting high_MutexSpin through the sysctl.
732 */
733 high_MutexSpin = low_MutexSpin;
734
735 uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
736 PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
737 nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
738 }
739 STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
740
741
/*
 * Called once all of the ml_processor_info_t structures have been
 * initialized and all the processors have been started through
 * processor_boot().
 *
 * Required by the scheduler subsystem; simply forwards to
 * sched_cpu_init_completed().
 */
void
ml_cpu_init_completed(void)
{
	sched_cpu_init_completed();
}
753
754 /*
755 * This tracks which cpus are between ml_cpu_down and ml_cpu_up
756 */
757 _Atomic uint64_t ml_cpu_up_processors = 0;
758
759 void
ml_cpu_up(void)760 ml_cpu_up(void)
761 {
762 cpu_data_t *cpu_data_ptr = getCpuDatap();
763
764 assert(!bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));
765
766 atomic_bit_set(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_relaxed);
767 }
768
769 /*
770 * These are called from the machine-independent routine cpu_up()
771 * to perform machine-dependent info updates.
772 *
773 * The update to CPU counts needs to be separate from other actions
774 * because we don't update the counts when CLPC causes temporary
775 * cluster powerdown events, as these must be transparent to the user.
776 */
777
778 void
ml_cpu_up_update_counts(int cpu_id)779 ml_cpu_up_update_counts(int cpu_id)
780 {
781 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
782
783 os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
784
785 os_atomic_inc(&machine_info.physical_cpu, relaxed);
786 os_atomic_inc(&machine_info.logical_cpu, relaxed);
787 }
788
789 int
ml_find_next_up_processor()790 ml_find_next_up_processor()
791 {
792 if (BootCpuData.cpu_running) {
793 return BootCpuData.cpu_number;
794 }
795
796 int next_active_cpu = lsb_first(os_atomic_load(&ml_cpu_up_processors, relaxed));
797
798 if (next_active_cpu == -1) {
799 assertf(ml_is_quiescing(), "can only have no active CPUs in quiesce state");
800 next_active_cpu = BootCpuData.cpu_number;
801 }
802
803 return next_active_cpu;
804 }
805
806 /*
807 * These are called from the machine-independent routine cpu_down()
808 * to perform machine-dependent info updates.
809 *
810 * The update to CPU counts needs to be separate from other actions
811 * because we don't update the counts when CLPC causes temporary
812 * cluster powerdown events, as these must be transparent to the user.
813 */
814 void
ml_cpu_down(void)815 ml_cpu_down(void)
816 {
817 /*
818 * If we want to deal with outstanding IPIs, we need to
819 * do relatively early in the processor_doshutdown path,
820 * as we pend decrementer interrupts using the IPI
821 * mechanism if we cannot immediately service them (if
822 * IRQ is masked). Do so now.
823 *
824 * We aren't on the interrupt stack here; would it make
825 * more sense to disable signaling and then enable
826 * interrupts? It might be a bit cleaner.
827 */
828 cpu_data_t *cpu_data_ptr = getCpuDatap();
829 cpu_data_ptr->cpu_running = FALSE;
830
831 assert((cpu_data_ptr->cpu_signal & SIGPdisabled) == 0);
832 assert(bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));
833
834 atomic_bit_clear(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_release);
835
836 if (cpu_data_ptr == &BootCpuData && ml_is_quiescing()) {
837 /*
838 * This is the boot CPU powering down for S2R, don't try to migrate its timers,
839 * because there is nobody else active to migrate it to.
840 */
841 assert3u(os_atomic_load(&ml_cpu_up_processors, relaxed), ==, 0);
842 } else if (cpu_data_ptr != &BootCpuData || (support_bootcpu_shutdown && !ml_is_quiescing())) {
843 int next_cpu = ml_find_next_up_processor();
844
845 cpu_data_t* new_cpu_datap = cpu_datap(next_cpu);
846
847 /*
848 * Move all of this cpu's timers to another cpu that has not gone through ml_cpu_down,
849 * and poke it in case there's a sooner deadline for it to schedule.
850 *
851 * This depends on ml_cpu_down never running concurrently, which is guaranteed by
852 * the processor_updown_lock.
853 */
854 timer_queue_shutdown(next_cpu, &cpu_data_ptr->rtclock_timer.queue,
855 &new_cpu_datap->rtclock_timer.queue);
856
857 /*
858 * Trigger timer_queue_expire_local to execute on the remote CPU.
859 *
860 * Because we have interrupts disabled here, we cannot use a
861 * standard cpu_xcall, which would deadlock against the stackshot
862 * IPI. This must be a fire-and-forget IPI.
863 */
864 kern_return_t rv = cpu_signal(new_cpu_datap, SIGPTimerLocal, NULL, NULL);
865
866 if (rv != KERN_SUCCESS) {
867 panic("ml_cpu_down: cpu_signal of cpu %d failure %d", next_cpu, rv);
868 }
869 } else {
870 panic("boot cpu powering down with nowhere for its timers to go");
871 }
872
873 cpu_signal_handler_internal(TRUE);
874
875 /* There should be no more pending IPIs on this core. */
876 assert3u(getCpuDatap()->cpu_signal, ==, SIGPdisabled);
877 }
878
879 void
ml_cpu_down_update_counts(int cpu_id)880 ml_cpu_down_update_counts(int cpu_id)
881 {
882 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
883
884 os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
885
886 os_atomic_dec(&machine_info.physical_cpu, relaxed);
887 os_atomic_dec(&machine_info.logical_cpu, relaxed);
888 }
889
890
891 unsigned int
ml_get_machine_mem(void)892 ml_get_machine_mem(void)
893 {
894 return machine_info.memory_size;
895 }
896
897 __attribute__((noreturn))
898 void
halt_all_cpus(boolean_t reboot)899 halt_all_cpus(boolean_t reboot)
900 {
901 if (reboot) {
902 printf("MACH Reboot\n");
903 PEHaltRestart(kPERestartCPU);
904 } else {
905 printf("CPU halted\n");
906 PEHaltRestart(kPEHaltCPU);
907 }
908 while (1) {
909 ;
910 }
911 }
912
913 __attribute__((noreturn))
914 void
halt_cpu(void)915 halt_cpu(void)
916 {
917 halt_all_cpus(FALSE);
918 }
919
920 /*
921 * Routine: machine_signal_idle
922 * Function:
923 */
924 void
machine_signal_idle(processor_t processor)925 machine_signal_idle(
926 processor_t processor)
927 {
928 cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
929 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
930 }
931
932 void
machine_signal_idle_deferred(processor_t processor)933 machine_signal_idle_deferred(
934 processor_t processor)
935 {
936 cpu_signal_deferred(processor_to_cpu_datap(processor), SIGPdeferred);
937 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
938 }
939
940 void
machine_signal_idle_cancel(processor_t processor)941 machine_signal_idle_cancel(
942 processor_t processor)
943 {
944 cpu_signal_cancel(processor_to_cpu_datap(processor), SIGPdeferred);
945 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
946 }
947
948 /*
949 * Routine: ml_install_interrupt_handler
950 * Function: Initialize Interrupt Handler
951 */
952 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)953 ml_install_interrupt_handler(
954 void *nub,
955 int source,
956 void *target,
957 IOInterruptHandler handler,
958 void *refCon)
959 {
960 cpu_data_t *cpu_data_ptr;
961 boolean_t current_state;
962
963 current_state = ml_set_interrupts_enabled(FALSE);
964 cpu_data_ptr = getCpuDatap();
965
966 cpu_data_ptr->interrupt_nub = nub;
967 cpu_data_ptr->interrupt_source = source;
968 cpu_data_ptr->interrupt_target = target;
969 cpu_data_ptr->interrupt_handler = handler;
970 cpu_data_ptr->interrupt_refCon = refCon;
971
972 (void) ml_set_interrupts_enabled(current_state);
973 }
974
/*
 *	Routine:        ml_init_interrupt
 *	Function:       Initialize Interrupts
 *
 *	On HAS_IPI builds this programs the deferred-IPI timer from
 *	deferred_ipi_timer_ns. Does nothing on builds without HAS_IPI.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * ml_init_interrupt is called once for each CPU, but there is only
	 * one global copy of the register (on skye), so programming it from
	 * every CPU would be redundant. Only CPUs whose cpu_data has
	 * cluster_master set perform the write.
	 */
	cpu_data_t *self = getCpuDatap();

	if (self->cluster_master) {
		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
	}
#endif
}
993
994 /*
995 * Routine: ml_init_timebase
996 * Function: register and setup Timebase, Decremeter services
997 */
998 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)999 ml_init_timebase(
1000 void *args,
1001 tbd_ops_t tbd_funcs,
1002 vm_offset_t int_address,
1003 vm_offset_t int_value __unused)
1004 {
1005 cpu_data_t *cpu_data_ptr;
1006
1007 cpu_data_ptr = (cpu_data_t *)args;
1008
1009 if ((cpu_data_ptr == &BootCpuData)
1010 && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
1011 rtclock_timebase_func = *tbd_funcs;
1012 rtclock_timebase_addr = int_address;
1013 }
1014 }
1015
1016 #define ML_READPROP_MANDATORY UINT64_MAX
1017
1018 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)1019 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
1020 {
1021 void const *prop;
1022 unsigned int propSize;
1023
1024 if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
1025 if (propSize == sizeof(uint8_t)) {
1026 return *((uint8_t const *)prop);
1027 } else if (propSize == sizeof(uint16_t)) {
1028 return *((uint16_t const *)prop);
1029 } else if (propSize == sizeof(uint32_t)) {
1030 return *((uint32_t const *)prop);
1031 } else if (propSize == sizeof(uint64_t)) {
1032 return *((uint64_t const *)prop);
1033 } else {
1034 panic("CPU property '%s' has bad size %u", propertyName, propSize);
1035 }
1036 } else {
1037 if (default_value == ML_READPROP_MANDATORY) {
1038 panic("Missing mandatory property '%s'", propertyName);
1039 }
1040 return default_value;
1041 }
1042 }
1043
1044 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)1045 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
1046 {
1047 uint64_t const *prop;
1048 unsigned int propSize;
1049
1050 if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
1051 return FALSE;
1052 }
1053
1054 if (propSize != sizeof(uint64_t) * 2) {
1055 panic("Wrong property size for %s", propertyName);
1056 }
1057
1058 *pa_ptr = prop[0];
1059 *len_ptr = prop[1];
1060 return TRUE;
1061 }
1062
1063 static boolean_t
ml_is_boot_cpu(const DTEntry entry)1064 ml_is_boot_cpu(const DTEntry entry)
1065 {
1066 void const *prop;
1067 unsigned int propSize;
1068
1069 if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
1070 panic("unable to retrieve state for cpu");
1071 }
1072
1073 if (strncmp((char const *)prop, "running", propSize) == 0) {
1074 return TRUE;
1075 } else {
1076 return FALSE;
1077 }
1078 }
1079
/*
 * Apply build-time and boot-arg overrides to the cluster power-down flag.
 *
 * flag - in/out; on entry the value derived so far, on exit the value after
 *        the overrides below have been applied in order.
 */
static void
ml_cluster_power_override(unsigned int *flag)
{
	/* Boot-args that allow threads to be bound to a cpu or cluster. */
	static const char *const binding_boot_args[] = {
		"enable_skstb",
		"enable_skstsct",
	};

#if XNU_CLUSTER_POWER_DOWN
	/*
	 * Old method (H14/H15): enable CPD in the kernel build
	 * For H16+, *flag may have been set to 1 through EDT
	 */
	*flag = 1;
#endif

	/*
	 * If a boot-arg is set that allows threads to be bound
	 * to a cpu or cluster, cluster_power_down must
	 * default to false.
	 */
#ifdef CONFIG_XNUPOST
	uint64_t kernel_post = 0;
	PE_parse_boot_argn("kernPOST", &kernel_post, sizeof(kernel_post));
	if (kernel_post != 0) {
		*flag = 0;
	}
#endif
	for (unsigned int idx = 0; idx < sizeof(binding_boot_args) / sizeof(binding_boot_args[0]); idx++) {
		if (PE_parse_boot_argn(binding_boot_args[idx], NULL, 0)) {
			*flag = 0;
		}
	}

	/* Always let the user manually override, even if it's unsupported */
	PE_parse_boot_argn("cluster_power", flag, sizeof(*flag));
}
1113
1114
1115 static void
ml_read_chip_revision(unsigned int * rev __unused)1116 ml_read_chip_revision(unsigned int *rev __unused)
1117 {
1118 // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
1119 #ifdef APPLE_ARM64_ARCH_FAMILY
1120 DTEntry entryP;
1121
1122 if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
1123 *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
1124 } else {
1125 *rev = CPU_VERSION_UNKNOWN;
1126 }
1127 #endif
1128 }
1129
/*
 * Populate the global topology_info from the children of the "/cpus" device
 * tree node and program TPIDR_EL0 / TPIDRRO_EL0 for the boot CPU.
 *
 * For every CPU node that is not filtered out by the "cpus" / "cpumask"
 * boot-args, a topology_info.cpus[] entry is filled in (ids, cache
 * properties, register ranges, cluster type) and the CPU is attached to a
 * logical cluster, creating the topology_info.clusters[] entry the first
 * time a physical cluster is seen.  Invalid boot-arg combinations are not
 * reported here; they only clear cpu_config_correct.
 */
void
ml_parse_cpu_topology(void)
{
	DTEntry entry, child __unused;
	OpaqueDTEntryIterator iter;
	uint32_t cpu_boot_arg = MAX_CPUS;
	uint64_t cpumask_boot_arg = ULLONG_MAX;
	int err;

	/* Physical cluster id -> logical cluster id, -1 while the physical cluster is unseen. */
	int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
	/* Largest MPIDR_CPU_ID() seen in each physical cluster (only updated when HAS_CLUSTER). */
	int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));

	// The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
	// so that we trigger a panic later in the boot process, once serial is enabled.
	if (cpus_boot_arg_present && cpumask_boot_arg_present) {
		cpu_config_correct = false;
	}

	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
	assert(err == kSuccess);

	err = SecureDTInitEntryIterator(entry, &iter);
	assert(err == kSuccess);

	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
		cluster_offsets[i] = -1;
		cluster_phys_to_logical[i] = -1;
		cluster_max_cpu_phys_id[i] = 0;
	}

	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
		/*
		 * Bit 0 of the running mask belongs to the current node; the mask is
		 * shifted for every node visited, including ones skipped below.
		 */
		boolean_t cpu_enabled = cpumask_boot_arg & 1;
		cpumask_boot_arg >>= 1;

		// Boot CPU disabled in cpumask. Flag this so that we trigger a panic
		// later in the boot process, once serial is enabled.
		if (is_boot_cpu && !cpu_enabled) {
			cpu_config_correct = false;
		}

		// Ignore this CPU if it has been disabled by the cpumask= boot-arg.
		if (!is_boot_cpu && !cpu_enabled) {
			continue;
		}

		// If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
		// been added to the topology struct yet, and we only have one slot left, then skip
		// every other non-boot CPU in order to leave room for the boot CPU.
		//
		// e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
		// array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
		if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
			continue;
		}
		if (topology_info.num_cpus >= cpu_boot_arg) {
			break;
		}

		ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];

		/* Logical CPU ids are assigned densely, in the order nodes are accepted. */
		cpu->cpu_id = topology_info.num_cpus++;
		assert(cpu->cpu_id < MAX_CPUS);
		topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);

		cpu->die_id = (int)ml_readprop(child, "die-id", 0);
		topology_info.max_die_id = MAX(topology_info.max_die_id, cpu->die_id);

		/* "reg" is the only mandatory per-CPU property; ml_readprop panics if it is absent. */
		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);

		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);

		/* Optional register ranges; the return values are ignored, so absent properties leave the fields as they were. */
		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
		cpu->cluster_type = CLUSTER_TYPE_SMP;

		/* "cluster-type" is compared against the characters 'E' and 'P'; anything else keeps CLUSTER_TYPE_SMP. */
		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
		if (cluster_type == 'E') {
			cpu->cluster_type = CLUSTER_TYPE_E;
		} else if (cluster_type == 'P') {
			cpu->cluster_type = CLUSTER_TYPE_P;
		}

		/* A non-zero "cluster-power-down" on any CPU node enables the global flag. */
		if (ml_readprop(child, "cluster-power-down", 0)) {
			topology_info.cluster_power_down = 1;
		}

		topology_info.cluster_type_num_cpus[cpu->cluster_type]++;

		/*
		 * Since we want to keep a linear cluster ID space, we cannot just rely
		 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
		 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
		 */
#if HAS_CLUSTER
		uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
#else
		uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
#endif
		assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
		cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
		    topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);

		assert(cpu->cluster_id < MAX_CPU_CLUSTERS);

		ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
		/* num_cpus == 0 means this is the first CPU of the cluster: initialize the cluster record. */
		if (cluster->num_cpus == 0) {
			assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);

			topology_info.num_clusters++;
			topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
			topology_info.cluster_types |= (1 << cpu->cluster_type);

			cluster->cluster_id = cpu->cluster_id;
			cluster->die_id = cpu->die_id;
			cluster->cluster_type = cpu->cluster_type;
			cluster->first_cpu_id = cpu->cpu_id;
			assert(cluster_phys_to_logical[phys_cluster_id] == -1);
			cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;

			topology_info.cluster_type_num_clusters[cluster->cluster_type]++;

			// Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
			// If we wind up with a bunch of these, we might want to create separate per-cluster
			// EDT nodes and have the CPU nodes reference them through a phandle.
			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
		}

#if HAS_CLUSTER
		/* Track the highest per-cluster CPU id; used below to size the cluster_offsets regions. */
		if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
			cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
		}
#endif

		/* Both ids default to the values derived from MPIDR when EDT does not provide them. */
		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
		cluster->die_cluster_id = cpu->die_cluster_id;

		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));

		cluster->num_cpus++;
		cluster->cpu_mask |= 1ULL << cpu->cpu_id;

		if (is_boot_cpu) {
			assert(topology_info.boot_cpu == NULL);
			topology_info.boot_cpu = cpu;
			topology_info.boot_cluster = cluster;
		}

#if CONFIG_SPTM
		sptm_register_cpu(cpu->phys_id);
#endif
	}

#if HAS_CLUSTER
	/*
	 * Build the cluster offset array, ensuring that the region reserved
	 * for each physical cluster contains enough entries to be indexed
	 * by the maximum physical CPU ID (AFF0) within the cluster.
	 */
	unsigned int cur_cluster_offset = 0;
	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
		if (cluster_phys_to_logical[i] != -1) {
			cluster_offsets[i] = cur_cluster_offset;
			cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
		}
	}
	assert(cur_cluster_offset <= MAX_CPUS);
#else
	/*
	 * For H10, there are really 2 physical clusters, but they are not separated
	 * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
	 * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
	 * treat H10 and earlier devices as though they contain a single cluster.
	 */
	cluster_offsets[0] = 0;
#endif
	assert(topology_info.boot_cpu != NULL);
	ml_read_chip_revision(&topology_info.chip_revision);
	ml_cluster_power_override(&topology_info.cluster_power_down);

	/*
	 * Set TPIDR_EL0 to indicate the correct cpu number & cluster id,
	 * as we may not be booting from cpu 0.  Userspace will consume
	 * the current CPU number through this register.  For non-boot
	 * cores, this is done in start.s (start_cpu) using the per-cpu
	 * data object.
	 */
	ml_topology_cpu_t *boot_cpu = topology_info.boot_cpu;
	uint64_t tpidr_el0 = ((boot_cpu->cpu_id << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
	    ((boot_cpu->cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
	/* The asserts verify that neither id was truncated by its mask. */
	assert(((tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == boot_cpu->cpu_id);
	assert(((tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == boot_cpu->cluster_id);
	__builtin_arm_wsr64("TPIDR_EL0", tpidr_el0);

	__builtin_arm_wsr64("TPIDRRO_EL0", 0);
}
1333
1334 const ml_topology_info_t *
ml_get_topology_info(void)1335 ml_get_topology_info(void)
1336 {
1337 return &topology_info;
1338 }
1339
1340 void
ml_map_cpu_pio(void)1341 ml_map_cpu_pio(void)
1342 {
1343 unsigned int i;
1344
1345 for (i = 0; i < topology_info.num_cpus; i++) {
1346 ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1347 if (cpu->cpu_IMPL_pa) {
1348 cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1349 cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1350 }
1351 if (cpu->cpu_UTTDBG_pa) {
1352 cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1353 }
1354 }
1355
1356 for (i = 0; i < topology_info.num_clusters; i++) {
1357 ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1358 if (cluster->acc_IMPL_pa) {
1359 cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1360 }
1361 if (cluster->cpm_IMPL_pa) {
1362 cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1363 }
1364 }
1365 }
1366
1367 __mockable unsigned int
ml_get_cpu_count(void)1368 ml_get_cpu_count(void)
1369 {
1370 return topology_info.num_cpus;
1371 }
1372
1373 unsigned int
ml_get_cluster_count(void)1374 ml_get_cluster_count(void)
1375 {
1376 return topology_info.num_clusters;
1377 }
1378
1379 int
ml_get_boot_cpu_number(void)1380 ml_get_boot_cpu_number(void)
1381 {
1382 return topology_info.boot_cpu->cpu_id;
1383 }
1384
1385 cluster_type_t
ml_get_boot_cluster_type(void)1386 ml_get_boot_cluster_type(void)
1387 {
1388 return topology_info.boot_cluster->cluster_type;
1389 }
1390
1391 int
ml_get_cpu_number(uint32_t phys_id)1392 ml_get_cpu_number(uint32_t phys_id)
1393 {
1394 phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1395
1396 for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1397 if (topology_info.cpus[i].phys_id == phys_id) {
1398 return i;
1399 }
1400 }
1401
1402 return -1;
1403 }
1404
1405 int
ml_get_cluster_number(uint32_t phys_id)1406 ml_get_cluster_number(uint32_t phys_id)
1407 {
1408 int cpu_id = ml_get_cpu_number(phys_id);
1409 if (cpu_id < 0) {
1410 return -1;
1411 }
1412
1413 ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1414
1415 return cpu->cluster_id;
1416 }
1417
/*
 * Return the logical CPU number of the CPU executing this code, looked up
 * from MPIDR_EL1.  A failed lookup (-1 stored into an unsigned) is caught
 * by the assertion on builds where asserts are enabled.
 */
unsigned int
ml_get_cpu_number_local(void)
{
	uint64_t mpidr = 0;

	/* We identify the CPU based on the constant bits of MPIDR_EL1. */
	MRS(mpidr, "MPIDR_EL1");

	const unsigned cpu_number = ml_get_cpu_number((uint32_t)mpidr);

	assert(cpu_number <= (unsigned int)ml_get_max_cpu_number());

	return cpu_number;
}
1432
/*
 * Return the logical cluster number of the CPU executing this code, looked
 * up from MPIDR_EL1.  A failed lookup (-1 stored into an unsigned) is caught
 * by the assertion on builds where asserts are enabled.
 *
 * Defined with an explicit (void) parameter list so the definition is a
 * prototype, consistent with the other accessors in this file.
 */
int
ml_get_cluster_number_local(void)
{
	uint64_t mpidr_el1_value = 0;
	unsigned cluster_id;

	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
	MRS(mpidr_el1_value, "MPIDR_EL1");
	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);

	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());

	return cluster_id;
}
1447
1448 int
ml_get_max_cpu_number(void)1449 ml_get_max_cpu_number(void)
1450 {
1451 return topology_info.max_cpu_id;
1452 }
1453
1454 int
ml_get_max_cluster_number(void)1455 ml_get_max_cluster_number(void)
1456 {
1457 return topology_info.max_cluster_id;
1458 }
1459
1460 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1461 ml_get_first_cpu_id(unsigned int cluster_id)
1462 {
1463 return topology_info.clusters[cluster_id].first_cpu_id;
1464 }
1465
1466 static_assert(MAX_CPUS <= 256, "MAX_CPUS must fit in _COMM_PAGE_CPU_TO_CLUSTER; Increase table size if needed");
1467
1468 void
ml_map_cpus_to_clusters(uint8_t * table)1469 ml_map_cpus_to_clusters(uint8_t *table)
1470 {
1471 for (uint16_t cpu_id = 0; cpu_id < topology_info.num_cpus; cpu_id++) {
1472 *(table + cpu_id) = (uint8_t)(topology_info.cpus[cpu_id].cluster_id);
1473 }
1474 }
1475
1476 /*
1477 * Return the die id of a cluster.
1478 */
1479 unsigned int
ml_get_die_id(unsigned int cluster_id)1480 ml_get_die_id(unsigned int cluster_id)
1481 {
1482 /*
1483 * The current implementation gets the die_id from the
1484 * first CPU of the cluster.
1485 * rdar://80917654 (Add the die_id field to the cluster topology info)
1486 */
1487 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1488 return topology_info.cpus[first_cpu].die_id;
1489 }
1490
1491 /*
1492 * Return the index of a cluster in its die.
1493 */
1494 unsigned int
ml_get_die_cluster_id(unsigned int cluster_id)1495 ml_get_die_cluster_id(unsigned int cluster_id)
1496 {
1497 /*
1498 * The current implementation gets the die_id from the
1499 * first CPU of the cluster.
1500 * rdar://80917654 (Add the die_id field to the cluster topology info)
1501 */
1502 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1503 return topology_info.cpus[first_cpu].die_cluster_id;
1504 }
1505
1506 /*
1507 * Return the highest die id of the system.
1508 */
1509 unsigned int
ml_get_max_die_id(void)1510 ml_get_max_die_id(void)
1511 {
1512 return topology_info.max_die_id;
1513 }
1514
/*
 * Lockdown initialization hook.  When any of the KTRR/CTRR/PV_CTRR kernel
 * integrity options is defined this calls rorgn_stash_range(); otherwise it
 * does nothing.
 *
 * Defined with an explicit (void) parameter list so the definition is a
 * prototype, consistent with the rest of the file.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	rorgn_stash_range();
#endif
}
1522
1523 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1524 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1525 {
1526 if (!f) {
1527 return KERN_FAILURE;
1528 }
1529
1530 assert(lockdown_done);
1531 f(this); // XXX: f this whole function
1532
1533 return KERN_SUCCESS;
1534 }
1535
1536 static mcache_flush_function mcache_flush_func;
1537 static void* mcache_flush_service;
1538 kern_return_t
ml_mcache_flush_callback_register(mcache_flush_function func,void * service)1539 ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
1540 {
1541 mcache_flush_service = service;
1542 mcache_flush_func = func;
1543
1544 return KERN_SUCCESS;
1545 }
1546
1547 kern_return_t
ml_mcache_flush(void)1548 ml_mcache_flush(void)
1549 {
1550 if (!mcache_flush_func) {
1551 panic("Cannot flush M$ with no flush callback registered");
1552
1553 return KERN_FAILURE;
1554 } else {
1555 return mcache_flush_func(mcache_flush_service);
1556 }
1557 }
1558
1559
1560 kern_return_t ml_mem_fault_report_enable_register(void);
1561 kern_return_t
ml_mem_fault_report_enable_register(void)1562 ml_mem_fault_report_enable_register(void)
1563 {
1564 return KERN_SUCCESS;
1565 }
1566
1567 kern_return_t ml_amcc_error_inject_register(void);
1568 kern_return_t
ml_amcc_error_inject_register(void)1569 ml_amcc_error_inject_register(void)
1570 {
1571 return KERN_SUCCESS;
1572 }
1573
1574 kern_return_t ml_dcs_error_inject_register(void);
1575 kern_return_t
ml_dcs_error_inject_register(void)1576 ml_dcs_error_inject_register(void)
1577 {
1578 return KERN_SUCCESS;
1579 }
1580
1581
1582 extern lck_mtx_t pset_create_lock;
1583
1584 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1585 ml_processor_register(ml_processor_info_t *in_processor_info,
1586 processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1587 perfmon_interrupt_handler_func *pmi_handler_out)
1588 {
1589 cpu_data_t *this_cpu_datap;
1590 processor_set_t pset;
1591 boolean_t is_boot_cpu;
1592 static unsigned int reg_cpu_count = 0;
1593
1594 if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1595 return KERN_FAILURE;
1596 }
1597
1598 if ((unsigned)OSIncrementAtomic((SInt32*)®_cpu_count) >= topology_info.num_cpus) {
1599 return KERN_FAILURE;
1600 }
1601
1602 if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1603 is_boot_cpu = FALSE;
1604 this_cpu_datap = cpu_data_alloc(FALSE);
1605 cpu_data_init(this_cpu_datap);
1606 } else {
1607 this_cpu_datap = &BootCpuData;
1608 is_boot_cpu = TRUE;
1609 /*
1610 * Note that ml_processor_register happens for the boot cpu
1611 * *after* it starts running arbitrary threads, possibly
1612 * including *userspace*, depending on how long the CPU
1613 * services take to match.
1614 */
1615 }
1616
1617 assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1618
1619 this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1620
1621 if (!is_boot_cpu) {
1622 this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1623 cpu_data_register(this_cpu_datap);
1624 assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1625 }
1626
1627 this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1628 this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1629 nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1630 this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1631
1632 this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1633 this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1634
1635 this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1636 this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1637 this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1638
1639 this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1640 this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1641 this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1642 this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1643 this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1644 this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1645
1646 /*
1647 * Encode cpu_id, cluster_id to be stored in TPIDR_EL0 (see
1648 * cswitch.s:set_thread_registers, start.s:start_cpu) for consumption
1649 * by userspace.
1650 */
1651 this_cpu_datap->cpu_tpidr_el0 = ((this_cpu_datap->cpu_number << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
1652 ((this_cpu_datap->cpu_cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
1653 assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == this_cpu_datap->cpu_number);
1654 assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == this_cpu_datap->cpu_cluster_id);
1655
1656 #if HAS_CLUSTER
1657 this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1658 #else /* HAS_CLUSTER */
1659 this_cpu_datap->cluster_master = is_boot_cpu;
1660 #endif /* HAS_CLUSTER */
1661 lck_mtx_lock(&pset_create_lock);
1662 pset = pset_find(in_processor_info->cluster_id, NULL);
1663 kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1664 if (pset == NULL) {
1665 pset = pset_create(this_cpu_datap->cpu_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1666 assert(pset != PROCESSOR_SET_NULL);
1667 #if __AMP__
1668 kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1669 #endif /* __AMP__ */
1670 }
1671 kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1672 lck_mtx_unlock(&pset_create_lock);
1673
1674 processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1675 if (!is_boot_cpu) {
1676 processor_init(processor, this_cpu_datap->cpu_number, pset);
1677 }
1678
1679 *processor_out = processor;
1680 *ipi_handler_out = cpu_signal_handler;
1681 #if CPMU_AIC_PMI && CONFIG_CPU_COUNTERS
1682 *pmi_handler_out = mt_cpmu_aic_pmi;
1683 #else
1684 *pmi_handler_out = NULL;
1685 #endif /* CPMU_AIC_PMI && CONFIG_CPU_COUNTERS */
1686 if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1687 *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1688 }
1689
1690 #if CONFIG_CPU_COUNTERS
1691 kpc_register_cpu(this_cpu_datap);
1692 #endif /* CONFIG_CPU_COUNTERS */
1693
1694 #ifdef APPLEEVEREST
1695 /**
1696 * H15 SoCs have PIO lockdown applied at early boot for secondary CPUs.
1697 * Save PIO lock base addreses.
1698 */
1699 const uint32_t log_id = in_processor_info->log_id;
1700 const unsigned int cluster_id = topology_info.cpus[log_id].cluster_id;
1701 this_cpu_datap->cpu_reg_paddr = topology_info.cpus[log_id].cpu_IMPL_pa;
1702 this_cpu_datap->acc_reg_paddr = topology_info.clusters[cluster_id].acc_IMPL_pa;
1703 this_cpu_datap->cpm_reg_paddr = topology_info.clusters[cluster_id].cpm_IMPL_pa;
1704 #endif
1705
1706 #if HAS_MTE
1707 /*
1708 * To avoid predictable allocation tags, we want to initialize
1709 * RGSR_EL1.SEED as early as possible. Unfortunately this happens
1710 * too early during secondary CPU startup to safely use the
1711 * corecrypto-backed PRNG. So the primary CPU will generate
1712 * the seeds on their behalf.
1713 */
1714 if (!is_boot_cpu) {
1715 this_cpu_datap->mte_rgsr_el1_seed = arm_mte_random_rgsr_el1_seed();
1716 }
1717 #endif
1718
1719 if (!is_boot_cpu) {
1720 random_cpu_init(this_cpu_datap->cpu_number);
1721 // now let next CPU register itself
1722 OSIncrementAtomic((SInt32*)&real_ncpus);
1723 }
1724
1725 os_atomic_or(&this_cpu_datap->cpu_flags, InitState, relaxed);
1726
1727 #if !USE_APPLEARMSMP
1728 /*
1729 * AppleARMCPU's external processor_start call is now a no-op, so
1730 * boot the processor directly when it's registered.
1731 *
1732 * It needs to be booted here for the boot processor to finish the
1733 * subsequent registerInterrupt operations and unblock the other cores.
1734 */
1735 processor_boot(processor);
1736 #endif /* !USE_APPLEARMSMP */
1737
1738 return KERN_SUCCESS;
1739 }
1740
1741 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1742 ml_init_arm_debug_interface(
1743 void * in_cpu_datap,
1744 vm_offset_t virt_address)
1745 {
1746 ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1747 do_debugid();
1748 }
1749
1750 /*
1751 * Routine: init_ast_check
1752 * Function:
1753 */
1754 void
init_ast_check(__unused processor_t processor)1755 init_ast_check(
1756 __unused processor_t processor)
1757 {
1758 }
1759
1760 /*
1761 * Routine: cause_ast_check
1762 * Function:
1763 */
1764 void
cause_ast_check(processor_t processor)1765 cause_ast_check(
1766 processor_t processor)
1767 {
1768 assert(processor != PROCESSOR_NULL);
1769
1770 if (current_processor() != processor) {
1771 cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1772 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1773 }
1774 }
1775
1776 extern uint32_t cpu_idle_count;
1777
1778 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1779 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1780 {
1781 *icp = ml_at_interrupt_context();
1782 *pidlep = (cpu_idle_count == real_ncpus);
1783 }
1784
/*
 *	Routine:        ml_cause_interrupt
 *	Function:       Generate a fake interrupt
 *
 *	Currently a no-op placeholder (BS_XXX).
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX: not implemented */
}
1794
1795 /* Map memory map IO space */
1796 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1797 ml_io_map(
1798 vm_offset_t phys_addr,
1799 vm_size_t size)
1800 {
1801 return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1802 }
1803
1804 /* Map memory map IO space (with protections specified) */
1805 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1806 ml_io_map_with_prot(
1807 vm_offset_t phys_addr,
1808 vm_size_t size,
1809 vm_prot_t prot)
1810 {
1811 return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1812 }
1813
1814 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1815 ml_io_map_unmappable(
1816 vm_offset_t phys_addr,
1817 vm_size_t size,
1818 unsigned int flags)
1819 {
1820 return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1821 }
1822
1823 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1824 ml_io_map_wcomb(
1825 vm_offset_t phys_addr,
1826 vm_size_t size)
1827 {
1828 return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1829 }
1830
1831 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1832 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1833 {
1834 pmap_remove(kernel_pmap, addr, addr + sz);
1835 kmem_free(kernel_map, addr, sz);
1836 }
1837
1838 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1839 ml_map_high_window(
1840 vm_offset_t phys_addr,
1841 vm_size_t len)
1842 {
1843 return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1844 }
1845
1846 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1847 ml_static_ptovirt(
1848 vm_offset_t paddr)
1849 {
1850 return phystokv(paddr);
1851 }
1852
1853 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1854 ml_static_slide(
1855 vm_offset_t vaddr)
1856 {
1857 vm_offset_t slid_vaddr = 0;
1858
1859 #if CONFIG_SPTM
1860 if ((vaddr >= vm_sptm_offsets.unslid_base) && (vaddr < vm_sptm_offsets.unslid_top)) {
1861 slid_vaddr = vaddr + vm_sptm_offsets.slide;
1862 } else if ((vaddr >= vm_txm_offsets.unslid_base) && (vaddr < vm_txm_offsets.unslid_top)) {
1863 slid_vaddr = vaddr + vm_txm_offsets.slide;
1864 } else
1865 #endif /* CONFIG_SPTM */
1866 {
1867 slid_vaddr = vaddr + vm_kernel_slide;
1868 }
1869
1870 if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
1871 /* This is only intended for use on static kernel addresses. */
1872 return 0;
1873 }
1874
1875 return slid_vaddr;
1876 }
1877
1878 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1879 ml_static_unslide(
1880 vm_offset_t vaddr)
1881 {
1882 if (!VM_KERNEL_IS_SLID(vaddr)) {
1883 /* This is only intended for use on static kernel addresses. */
1884 return 0;
1885 }
1886
1887 #if CONFIG_SPTM
1888 /**
1889 * Addresses coming from the SPTM and TXM have a different slide than the
1890 * rest of the kernel.
1891 */
1892 if ((vaddr >= vm_sptm_offsets.slid_base) && (vaddr < vm_sptm_offsets.slid_top)) {
1893 return vaddr - vm_sptm_offsets.slide;
1894 }
1895
1896 if ((vaddr >= vm_txm_offsets.slid_base) && (vaddr < vm_txm_offsets.slid_top)) {
1897 return vaddr - vm_txm_offsets.slide;
1898 }
1899 #endif /* CONFIG_SPTM */
1900
1901 return vaddr - vm_kernel_slide;
1902 }
1903
/* Looks up the translation table entry covering a kernel VA; defined elsewhere. */
extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);

/*
 *	Routine:        ml_static_protect
 *	Function:	Walk the pages in [vaddr, trunc_page_64(vaddr + size)).
 *
 *	CONFIG_SPTM:	new_prot is ignored. Every frame whose SPTM type is not
 *			XNU_DEFAULT is retyped to XNU_DEFAULT; always returns
 *			KERN_SUCCESS.
 *
 *	Otherwise:	Rewrites the AP/NX/PNX bits of each mapped page's PTE to
 *			match new_prot. Panics if vaddr is below physmap_base,
 *			if both write and execute are requested, or if execute
 *			is requested once lockdown_done is set. Returns
 *			KERN_FAILURE when a block mapping or a contiguous-hint
 *			PTE does not already carry the requested protections;
 *			pages handled before the failure keep their new
 *			protections and are still covered by the TLB flush.
 */
kern_return_t
ml_static_protect(
	vm_offset_t vaddr, /* kernel virtual address */
	vm_size_t size,
	vm_prot_t new_prot __unused)
{
#if CONFIG_SPTM
	/**
	 * Retype any frames that may be passed to the VM to XNU_DEFAULT.
	 */
	for (vm_offset_t sptm_vaddr_cur = vaddr; sptm_vaddr_cur < trunc_page_64(vaddr + size); sptm_vaddr_cur += PAGE_SIZE) {
		/* Check if this frame is XNU_DEFAULT and only retype it if is not */
		sptm_paddr_t sptm_paddr_cur = kvtophys_nofail(sptm_vaddr_cur);
		sptm_frame_type_t current_type = sptm_get_frame_type(sptm_paddr_cur);
		if (current_type != XNU_DEFAULT) {
			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
			sptm_retype(sptm_paddr_cur, current_type, XNU_DEFAULT, retype_params);
		}
	}

	return KERN_SUCCESS;
#else /* CONFIG_SPTM */
	pt_entry_t arm_prot = 0;
	pt_entry_t arm_block_prot = 0;
	vm_offset_t vaddr_cur;
	ppnum_t ppn;
	kern_return_t result = KERN_SUCCESS;

	if (vaddr < physmap_base) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) physmap_base);
		/* NOTE(review): presumably unreachable if panic() does not return — confirm. */
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	/* NX is set unconditionally; PNX is set unless execute was requested. */
	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		/* Pages for which pmap_find_phys() returns 0 are skipped silently. */
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t *tte2;
			pt_entry_t *pte_p;
			pt_entry_t ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections.  We still want to run checks on a per-page basis.
					 */
					continue;
				}

				result = KERN_FAILURE;
				break;
			}

			/* Index the L3 table referenced by the TTE to reach this page's PTE. */
			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				{
					unsigned int i;
					pt_entry_t *ptep_iter;

					/* Update the four consecutive PTEs starting at pte_p. */
					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	/* Flush the TLB for everything walked so far, including on the failure paths. */
	if (vaddr_cur > vaddr) {
		/* The flush length is passed as a uint32_t, so the span must fit in 32 bits. */
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}


	return result;
#endif /* CONFIG_SPTM */
}
2047
#if defined(CONFIG_SPTM)
/*
 * Reports whether physaddr falls inside any entry of
 * arm_vm_kernelcache_ranges (start_phys inclusive, end_phys exclusive).
 */
static bool
ml_physaddr_in_bootkc_range(vm_offset_t physaddr)
{
	bool found = false;

	/* Stop scanning as soon as one range claims the address. */
	for (int idx = 0; !found && idx < arm_vm_kernelcache_numranges; idx++) {
		found = (physaddr >= arm_vm_kernelcache_ranges[idx].start_phys) &&
		    (physaddr < arm_vm_kernelcache_ranges[idx].end_phys);
	}

	return found;
}
#endif /* defined(CONFIG_SPTM) */
2063
/*
 * List of ml_static_mfree()'d pages that have been freed before
 * physical aperture sliding has taken place. If sliding has not
 * occurred yet, ml_static_mfree() will create pages, but not add them
 * to the free page queue yet. If it did, code that e.g. calls
 * pmap_page_alloc() could get a page back whose physical aperture
 * will later be slid, potentially leaving dangling pointers to the
 * old kva of the page behind.
 *
 * Such errors are hard to avoid and hard to debug, so instead we
 * queue pages in this dedicated list, and release all accumulated
 * pages into the regular free queue all at once right after physical
 * aperture sliding has taken place in arm_vm_prot_finalize().
 */
static
vm_page_list_t ml_static_mfree_pre_slide_list;

/*
 * Indicates whether we still need ml_static_mfree() to queue up pages
 * in ml_static_mfree_pre_slide_list. If not, ml_static_mfree()
 * directly releases newly created pages into the free queue instead.
 * Starts out true and is cleared by ml_release_deferred_pages().
 */
static
bool ml_static_mfree_queue_up = true;
2088
/*
 * Release all pages queued up by ml_static_mfree() to the free queue.
 * This should be called after physical aperture sliding has taken
 * place (i.e. in arm_vm_prot_finalize()), to indicate that the
 * physical aperture is now stable, and subsequently ml_static_mfree()
 * can directly release pages into the free queue instead.
 */
static void
ml_release_deferred_pages(void)
{
	/* Hand the whole deferred list, starting at its head, to the VM. */
	vm_page_free_list(ml_static_mfree_pre_slide_list.vmpl_head, false);
	/* From now on ml_static_mfree() bypasses the deferred list. */
	ml_static_mfree_queue_up = false;
}
2102
2103 /*
2104 * Routine: ml_static_mfree
2105 * Function:
2106 */
2107 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)2108 ml_static_mfree(
2109 vm_offset_t vaddr,
2110 vm_size_t size)
2111 {
2112 vm_offset_t vaddr_cur;
2113 vm_offset_t paddr_cur;
2114 ppnum_t ppn;
2115 uint32_t freed_pages = 0;
2116 uint32_t freed_kernelcache_pages = 0;
2117
2118
2119 /* It is acceptable (if bad) to fail to free. */
2120 if (vaddr < physmap_base) {
2121 return;
2122 }
2123
2124 assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
2125
2126 for (vaddr_cur = vaddr;
2127 vaddr_cur < trunc_page_64(vaddr + size);
2128 vaddr_cur += PAGE_SIZE) {
2129 /*
2130 * Some clients invoke ml_static_mfree on non-physical aperture
2131 * addresses. To support this, we convert the virtual address
2132 * to a physical aperture address, and remove all mappings of
2133 * the page as we update the physical aperture protections.
2134 */
2135 vm_offset_t vaddr_papt = phystokv(kvtophys(vaddr_cur));
2136 ppn = pmap_find_phys(kernel_pmap, vaddr_papt);
2137
2138 if (ppn != (vm_offset_t) NULL) {
2139 /*
2140 * It is not acceptable to fail to update the protections on a page
2141 * we will release to the VM. We need to either panic or continue.
2142 * For now, we'll panic (to help flag if there is memory we can
2143 * reclaim).
2144 */
2145 pmap_disconnect(ppn);
2146 if (ml_static_protect(vaddr_papt, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
2147 panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
2148 }
2149
2150 paddr_cur = ptoa(ppn);
2151
2152
2153 if (__probable(!ml_static_mfree_queue_up)) {
2154 vm_page_create_canonical(ppn);
2155 } else {
2156 vm_page_t m = vm_page_create(ppn, true, Z_WAITOK);
2157
2158 vm_page_list_push(&ml_static_mfree_pre_slide_list, m);
2159 }
2160
2161 freed_pages++;
2162 #if defined(CONFIG_SPTM)
2163 if (ml_physaddr_in_bootkc_range(paddr_cur))
2164 #else
2165 if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end)
2166 #endif
2167 {
2168 freed_kernelcache_pages++;
2169 }
2170 }
2171 }
2172
2173 vm_page_lockspin_queues();
2174 vm_page_wire_count -= freed_pages;
2175 vm_page_wire_count_initial -= freed_pages;
2176 vm_page_kernelcache_count -= freed_kernelcache_pages;
2177 vm_page_unlock_queues();
2178 #if DEBUG
2179 kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
2180 #endif
2181 }
2182
2183 /*
2184 * Routine: ml_page_protection_type
2185 * Function: Returns the type of page protection that the system supports.
2186 */
2187 ml_page_protection_t
ml_page_protection_type(void)2188 ml_page_protection_type(void)
2189 {
2190 #if CONFIG_SPTM
2191 return 2;
2192 #elif XNU_MONITOR
2193 return 1;
2194 #else
2195 return 0;
2196 #endif
2197 }
2198
/*
 * virtual to physical on wired pages
 *
 * Thin wrapper: returns whatever kvtophys() yields for vaddr.
 */
vm_offset_t
ml_vtophys(vm_offset_t vaddr)
{
	return kvtophys(vaddr);
}
2205
2206 /*
2207 * Routine: ml_nofault_copy
2208 * Function: Perform a physical mode copy if the source and destination have
2209 * valid translations in the kernel pmap. If translations are present, they are
2210 * assumed to be wired; e.g., no attempt is made to guarantee that the
2211 * translations obtained remain valid for the duration of the copy process.
2212 */
2213 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)2214 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
2215 {
2216 addr64_t cur_phys_dst, cur_phys_src;
2217 vm_size_t count, nbytes = 0;
2218
2219 while (size > 0) {
2220 if (!(cur_phys_src = kvtophys(virtsrc))) {
2221 break;
2222 }
2223 if (!(cur_phys_dst = kvtophys(virtdst))) {
2224 break;
2225 }
2226 if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
2227 !pmap_valid_address(trunc_page_64(cur_phys_src))) {
2228 break;
2229 }
2230 count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
2231 if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
2232 count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
2233 }
2234 if (count > size) {
2235 count = size;
2236 }
2237
2238 #if HAS_MTE
2239 bcopy_phys_with_options(cur_phys_src, cur_phys_dst, count, cppvDisableTagCheck);
2240 #else /* HAS_MTE */
2241 bcopy_phys(cur_phys_src, cur_phys_dst, count);
2242 #endif /* HAS_MTE */
2243
2244 nbytes += count;
2245 virtsrc += count;
2246 virtdst += count;
2247 size -= count;
2248 }
2249
2250 return nbytes;
2251 }
2252
2253 /*
2254 * Routine: ml_validate_nofault
2255 * Function: Validate that ths address range has a valid translations
2256 * in the kernel pmap. If translations are present, they are
2257 * assumed to be wired; i.e. no attempt is made to guarantee
2258 * that the translation persist after the check.
2259 * Returns: TRUE if the range is mapped and will not cause a fault,
2260 * FALSE otherwise.
2261 */
2262
2263 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)2264 ml_validate_nofault(
2265 vm_offset_t virtsrc, vm_size_t size)
2266 {
2267 addr64_t cur_phys_src;
2268 uint32_t count;
2269
2270 while (size > 0) {
2271 if (!(cur_phys_src = kvtophys(virtsrc))) {
2272 return FALSE;
2273 }
2274 if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
2275 return FALSE;
2276 }
2277 count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
2278 if (count > size) {
2279 count = (uint32_t)size;
2280 }
2281
2282 virtsrc += count;
2283 size -= count;
2284 }
2285
2286 return TRUE;
2287 }
2288
/*
 * Reports a zero base address and zero size for the bounce pool.
 * Both output pointers are written unconditionally and must be non-NULL.
 */
void
ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
{
	*phys_addr = 0;
	*size = 0;
}
2295
/* Intentionally empty in this file: the argument is ignored. */
void
active_rt_threads(__unused boolean_t active)
{
}
2300
2301 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)2302 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
2303 {
2304 return;
2305 }
2306
2307 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
2308
2309 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)2310 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
2311 {
2312 if (cpu_qos_cb != NULL) {
2313 cpu_qos_update = cpu_qos_cb;
2314 } else {
2315 cpu_qos_update = cpu_qos_cb_default;
2316 }
2317 }
2318
/*
 * Forward a thread urgency change to the registered cpu_qos_update
 * callback, bracketed by MACH_URGENCY start/end debug trace points.
 * sched_latency only appears in the start trace point; nthread is unused.
 */
void
thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
{
	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);

	/* The callback takes the urgency as a plain int. */
	cpu_qos_update((int)urgency, rt_period, rt_deadline);

	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
}
2328
/* Intentionally empty in this file: the run count is ignored. */
void
machine_run_count(__unused uint32_t count)
{
}
2333
2334 #if KASAN
2335 vm_offset_t ml_stack_base(void);
2336 vm_size_t ml_stack_size(void);
2337
2338 vm_offset_t
ml_stack_base(void)2339 ml_stack_base(void)
2340 {
2341 uintptr_t local = (uintptr_t) &local;
2342 vm_offset_t intstack_top_ptr;
2343
2344 intstack_top_ptr = getCpuDatap()->intstack_top;
2345 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2346 return intstack_top_ptr - INTSTACK_SIZE;
2347 } else {
2348 return current_thread()->kernel_stack;
2349 }
2350 }
2351 vm_size_t
ml_stack_size(void)2352 ml_stack_size(void)
2353 {
2354 uintptr_t local = (uintptr_t) &local;
2355 vm_offset_t intstack_top_ptr;
2356
2357 intstack_top_ptr = getCpuDatap()->intstack_top;
2358 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2359 return INTSTACK_SIZE;
2360 } else {
2361 return kernel_stack_size;
2362 }
2363 }
2364 #endif
2365
2366 #ifdef CONFIG_KCOV
2367
2368 kcov_cpu_data_t *
current_kcov_data(void)2369 current_kcov_data(void)
2370 {
2371 return ¤t_cpu_datap()->cpu_kcov_data;
2372 }
2373
2374 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)2375 cpu_kcov_data(int cpuid)
2376 {
2377 return &cpu_datap(cpuid)->cpu_kcov_data;
2378 }
2379
2380 #endif /* CONFIG_KCOV */
2381
/* Always reports FALSE in this file. */
boolean_t
machine_timeout_suspended(void)
{
	return FALSE;
}
2387
/* Stub in this file: ignores the deadline and always returns KERN_FAILURE. */
kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)
{
	return KERN_FAILURE;
}
2393
/*
 * System-register names used by the kernel timer code below. They are
 * string literals because they are passed to __builtin_arm_rsr64() and
 * __builtin_arm_wsr64().
 */
#if HAS_APPLE_GENERIC_TIMER
/* The kernel timer APIs always use the Apple timebase */
/* Raw S<op0>_<op1>_C<n>_C<m>_<op2> encodings stand in for register names. */
#define KERNEL_CNTV_TVAL_EL0 "S3_1_C15_C15_4"
#define KERNEL_CNTVCT_EL0 "S3_4_C15_C11_7"
#define KERNEL_CNTVCTSS_EL0 "S3_4_C15_C10_6"
#define KERNEL_CNTV_CTL_EL0 "S3_1_C15_C0_5"
#define KERNEL_CNTKCTL_EL1 "S3_4_C15_C9_6"
#else
/* Without the Apple generic timer, use the architectural registers. */
#define KERNEL_CNTV_TVAL_EL0 "CNTV_TVAL_EL0"
#define KERNEL_CNTVCT_EL0 "CNTVCT_EL0"
#define KERNEL_CNTVCTSS_EL0 "CNTVCTSS_EL0"
#define KERNEL_CNTV_CTL_EL0 "CNTV_CTL_EL0"
#define KERNEL_CNTKCTL_EL1 "CNTKCTL_EL1"
#endif
2408
2409 /*
2410 * Assumes fiq, irq disabled.
2411 */
2412 void
ml_set_decrementer(uint32_t dec_value)2413 ml_set_decrementer(uint32_t dec_value)
2414 {
2415 cpu_data_t *cdp = getCpuDatap();
2416
2417 assert(ml_get_interrupts_enabled() == FALSE);
2418 cdp->cpu_decrementer = dec_value;
2419
2420 if (cdp->cpu_set_decrementer_func) {
2421 cdp->cpu_set_decrementer_func(dec_value);
2422 } else {
2423 __builtin_arm_wsr64(KERNEL_CNTV_TVAL_EL0, (uint64_t)dec_value);
2424 }
2425 }
2426
/**
 * Perform a read of the timebase which is permitted to be executed
 * speculatively and/or out of program order.
 *
 * @return The raw value read from the KERNEL_CNTVCT_EL0 register.
 */
static inline uint64_t
speculative_timebase(void)
{
	return __builtin_arm_rsr64(KERNEL_CNTVCT_EL0);
}
2436
/**
 * Read a non-speculative view of the timebase if one is available,
 * otherwise fall back on an ISB to prevent speculation and
 * enforce ordering.
 *
 * @return The raw counter value read from the selected register.
 */
static inline uint64_t
nonspeculative_timebase(void)
{
#if   __ARM_ARCH_8_6__
	/* Self-synchronizing counter view: no explicit barrier is issued. */
	return __builtin_arm_rsr64(KERNEL_CNTVCTSS_EL0);
#else
	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
	// to other instructions executed on the same processor."
	__builtin_arm_isb(ISB_SY);
	return speculative_timebase();
#endif
}
2455
2456
/*
 * Return the raw hardware timebase as read by nonspeculative_timebase(),
 * without adding the per-CPU base timebase.
 */
uint64_t
ml_get_hwclock(void)
{
	uint64_t timebase = nonspeculative_timebase();
	return timebase;
}
2463
/*
 * Return the raw hardware timebase as read by speculative_timebase(),
 * without adding the per-CPU base timebase.
 */
uint64_t
ml_get_hwclock_speculative(void)
{
	uint64_t timebase = speculative_timebase();
	return timebase;
}
2470
2471 uint64_t
ml_get_timebase()2472 ml_get_timebase()
2473 {
2474 uint64_t clock, timebase;
2475
2476 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
2477 do {
2478 timebase = getCpuDatap()->cpu_base_timebase;
2479 os_compiler_barrier();
2480 clock = ml_get_hwclock();
2481 os_compiler_barrier();
2482 } while (getCpuDatap()->cpu_base_timebase != timebase);
2483
2484 return clock + timebase;
2485 }
2486
/**
 * Issue a barrier that guarantees all prior memory accesses will complete
 * before any subsequent timebase reads.
 *
 * The sequence is: a full-system DMB, a load of a local constant feeding a
 * conditional backwards branch, and finally a throwaway hardware clock read.
 */
void
ml_memory_to_timebase_fence(void)
{
	__builtin_arm_dmb(DMB_SY);
	/* Always zero, so the cbnz below never actually branches back. */
	const uint64_t take_backwards_branch = 0;
	/*
	 * NOTE(review): presumably the load-dependent branch exists to create a
	 * dependency that later instructions cannot be hoisted above; the
	 * rationale is not stated here — confirm.
	 */
	asm volatile (
	        "1:"
	        "ldr	x0, [%[take_backwards_branch]]" "\n"
	        "cbnz	x0, 1b"                         "\n"
	        :
	        : [take_backwards_branch] "r"(&take_backwards_branch)
	        : "x0"
	);

	/* throwaway read to prevent ml_get_speculative_timebase() reordering */
	(void)ml_get_hwclock();
}
2508
/**
 * Issue a barrier that guarantees all prior timebase reads will
 * be ordered before any subsequent memory accesses.
 *
 * Implemented as a single full-system ISB.
 */
void
ml_timebase_to_memory_fence(void)
{
	__builtin_arm_isb(ISB_SY);
}
2518
2519 /*
2520 * Get the speculative timebase without an ISB.
2521 */
2522 uint64_t
ml_get_speculative_timebase(void)2523 ml_get_speculative_timebase(void)
2524 {
2525 uint64_t clock, timebase;
2526
2527 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
2528 do {
2529 timebase = getCpuDatap()->cpu_base_timebase;
2530 os_compiler_barrier();
2531 clock = speculative_timebase();
2532
2533 os_compiler_barrier();
2534 } while (getCpuDatap()->cpu_base_timebase != timebase);
2535
2536 return clock + timebase;
2537 }
2538
/* Entropy source: simply the value of ml_get_speculative_timebase(). */
uint64_t
ml_get_timebase_entropy(void)
{
	return ml_get_speculative_timebase();
}
2544
2545 uint32_t
ml_get_decrementer(void)2546 ml_get_decrementer(void)
2547 {
2548 cpu_data_t *cdp = getCpuDatap();
2549 uint32_t dec;
2550
2551 assert(ml_get_interrupts_enabled() == FALSE);
2552
2553 if (cdp->cpu_get_decrementer_func) {
2554 dec = cdp->cpu_get_decrementer_func();
2555 } else {
2556 uint64_t wide_val;
2557
2558 wide_val = __builtin_arm_rsr64(KERNEL_CNTV_TVAL_EL0);
2559 dec = (uint32_t)wide_val;
2560 assert(wide_val == (uint64_t)dec);
2561 }
2562
2563 return dec;
2564 }
2565
2566 boolean_t
ml_get_timer_pending(void)2567 ml_get_timer_pending(void)
2568 {
2569 uint64_t cntv_ctl = __builtin_arm_rsr64(KERNEL_CNTV_CTL_EL0);
2570 return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2571 }
2572
2573 __attribute__((noreturn))
2574 void
platform_syscall(arm_saved_state_t * state)2575 platform_syscall(arm_saved_state_t *state)
2576 {
2577 uint32_t code;
2578
2579 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2580
2581 code = (uint32_t)get_saved_state_reg(state, 3);
2582
2583 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2584 get_saved_state_reg(state, 0),
2585 get_saved_state_reg(state, 1),
2586 get_saved_state_reg(state, 2));
2587
2588 switch (code) {
2589 case 2:
2590 /* set cthread */
2591 platform_syscall_kprintf("set cthread self.\n");
2592 thread_set_cthread_self(get_saved_state_reg(state, 0));
2593 break;
2594 case 3:
2595 /* get cthread */
2596 platform_syscall_kprintf("get cthread self.\n");
2597 set_user_saved_state_reg(state, 0, thread_get_cthread_self());
2598 break;
2599 case 0: /* I-Cache flush (removed) */
2600 case 1: /* D-Cache flush (removed) */
2601 default:
2602 platform_syscall_kprintf("unknown: %d\n", code);
2603 break;
2604 }
2605
2606 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2607 get_saved_state_reg(state, 0));
2608
2609 thread_exception_return();
2610 }
2611
/*
 * Enable the timebase event stream on the current CPU by programming the
 * kernel's CNTKCTL register: selects bit_index as the trigger bit, sets the
 * event-enable and event-direction bits, and, when a user timebase is
 * available, enables EL0 access to the counter registers.
 * Panics if bit_index is 64 or larger.
 */
static void
_enable_timebase_event_stream(uint32_t bit_index)
{
	if (bit_index >= 64) {
		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
	}

	uint64_t cntkctl = __builtin_arm_rsr64(KERNEL_CNTKCTL_EL1);

	/*
	 * NOTE(review): the index is OR-ed into the current register value
	 * without first clearing the EVENTI field — presumably the field is
	 * zero on entry; confirm.
	 */
	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
	cntkctl |= CNTKCTL_EL1_EVNTEN;
	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */

	/*
	 * If the SOC supports it (and it isn't broken), enable
	 * EL0 access to the timebase registers.
	 */
	if (user_timebase_type() != USER_TIMEBASE_NONE) {
		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
	}

	__builtin_arm_wsr64(KERNEL_CNTKCTL_EL1, cntkctl);

#if HAS_APPLE_GENERIC_TIMER
	/* Enable EL0 access to the ARM timebase registers too */
	uint64_t arm_cntkctl = __builtin_arm_rsr64("CNTKCTL_EL1");
	arm_cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
	__builtin_arm_wsr64("CNTKCTL_EL1", arm_cntkctl);
#endif
}
2642
/*
 * Turn timer on, unmask that interrupt.
 *
 * Writes CNTV_CTL_EL0_ENABLE (and nothing else) to the kernel's virtual
 * timer control register, then writes CNTP_CTL_EL0_IMASKED to the physical
 * timer control register(s).
 */
static void
_enable_virtual_timer(void)
{
	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */

	__builtin_arm_wsr64(KERNEL_CNTV_CTL_EL0, cntvctl);
	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
#if HAS_APPLE_GENERIC_TIMER
	/* NOTE(review): presumably the Apple-timebase counterpart of CNTP_CTL_EL0 — confirm. */
	__builtin_arm_wsr64("S3_1_C15_C13_4", CNTP_CTL_EL0_IMASKED);
#endif
}
2658
/*
 * Per-CPU FIQ/timer setup: enables the virtual timer. The enable_fiq
 * argument is ignored. Must be called with interrupts disabled.
 */
void
fiq_context_init(boolean_t enable_fiq __unused)
{
	/* Interrupts still disabled. */
	assert(ml_get_interrupts_enabled() == FALSE);
	_enable_virtual_timer();
}
2666
/*
 * Enable the event stream on the current CPU using the bit index
 * stored in arm64_eventi.
 */
void
wfe_timeout_init(void)
{
	_enable_timebase_event_stream(arm64_eventi);
}
2672
2673 /**
2674 * Configures, but does not enable, the WFE event stream. The event stream
2675 * generates an event at a set interval to act as a timeout for WFEs.
2676 *
2677 * This function sets the static global variable arm64_eventi to be the proper
2678 * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2679 * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2680 * is used by wfe_timeout_init to actually poke the registers and enable the
2681 * event stream.
2682 *
2683 * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2684 * is the trigger for the system to generate an event. The trigger can occur on
2685 * either the rising or falling edge of the bit depending on the value of
2686 * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2687 * falling edge (1->0) transition to generate events.
2688 */
2689 void
wfe_timeout_configure(void)2690 wfe_timeout_configure(void)
2691 {
2692 /* Could fill in our own ops here, if we needed them */
2693 uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0;
2694 uint32_t bit_index;
2695
2696 if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2697 if (events_per_sec <= 0) {
2698 events_per_sec = 1;
2699 } else if (events_per_sec > USEC_PER_SEC) {
2700 events_per_sec = USEC_PER_SEC;
2701 }
2702 } else {
2703 events_per_sec = USEC_PER_SEC;
2704 }
2705 ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2706 ticks_per_event = ticks_per_sec / events_per_sec;
2707
2708 /* Bit index of next power of two greater than ticks_per_event */
2709 bit_index = flsll(ticks_per_event) - 1;
2710 /* Round up to next power of two if ticks_per_event is initially power of two */
2711 if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2712 bit_index++;
2713 }
2714
2715 /*
2716 * The timer can only trigger on rising or falling edge, not both; we don't
2717 * care which we trigger on, but we do need to adjust which bit we are
2718 * interested in to account for this.
2719 *
2720 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2721 * falling edge of the given bit. Therefore, we must decrement the bit index
2722 * by one as when the bit before the one we care about makes a 1 -> 0
2723 * transition, the bit we care about makes a 0 -> 1 transition.
2724 *
2725 * For example if we want an event generated every 8 ticks (if we calculated
2726 * a bit_index of 3), we would want the event to be generated whenever the
2727 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2728 * see that the bit at index 2 makes a falling transition in this scenario,
2729 * so we would want EVENTI to be 2 instead of 3.
2730 */
2731 if (bit_index != 0) {
2732 bit_index--;
2733 }
2734
2735 arm64_eventi = bit_index;
2736 }
2737
2738 boolean_t
ml_delay_should_spin(uint64_t interval)2739 ml_delay_should_spin(uint64_t interval)
2740 {
2741 cpu_data_t *cdp = getCpuDatap();
2742
2743 if (cdp->cpu_idle_latency) {
2744 return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2745 } else {
2746 /*
2747 * Early boot, latency is unknown. Err on the side of blocking,
2748 * which should always be safe, even if slow
2749 */
2750 return FALSE;
2751 }
2752 }
2753
/* Thin wrapper: returns the result of thread_is_64bit_addr() for thread. */
boolean_t
ml_thread_is64bit(thread_t thread)
{
	return thread_is_64bit_addr(thread);
}
2759
/*
 * On DEVELOPMENT and DEBUG kernels, delay for yield_delay_us microseconds
 * when that value is non-zero. Does nothing on other kernels.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us == 0) {
		return;
	}

	delay(yield_delay_us);
#endif
}
2769
/* Intentionally empty in this file. */
void
ml_timer_evaluate(void)
{
}
2774
/* Always reports FALSE in this file. */
boolean_t
ml_timer_forced_evaluation(void)
{
	return FALSE;
}
2780
/*
 * Account gpu_ns_delta against the current task's coalition.
 *
 * NOTE(review): gpu_ns_delta is marked __unused although it is forwarded
 * below — presumably the callee can compile out in some configurations;
 * confirm before removing the attribute.
 */
void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
{
	/*
	 * For now: update the resource coalition stats of the
	 * current thread's coalition
	 */
	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
}
2790
/* Stub in this file: ignores the thread and always returns 0. */
uint64_t
ml_gpu_stat(__unused thread_t t)
{
	return 0;
}
2796
/* Out-of-line entry point that returns the result of current_thread_fast(). */
thread_t
current_thread(void)
{
	return current_thread_fast();
}
2802
2803 #if defined(HAS_APPLE_PAC)
/* Return the task's disable_user_jop field. task must be non-NULL. */
uint8_t
ml_task_get_disable_user_jop(task_t task)
{
	assert(task);
	return task->disable_user_jop;
}
2810
/* Store disable_user_jop into the task's field of the same name. task must be non-NULL. */
void
ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
{
	assert(task);
	task->disable_user_jop = disable_user_jop;
}
2817
2818 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2819 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2820 {
2821 assert(thread);
2822 if (disable_user_jop) {
2823 thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2824 } else {
2825 thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2826 }
2827 }
2828
2829 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2830 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2831 {
2832 if (inherit) {
2833 task->rop_pid = parent_task->rop_pid;
2834 } else {
2835 task->rop_pid = early_random();
2836 }
2837 }
2838
2839 /**
2840 * jop_pid may be inherited from the parent task or generated inside the shared
2841 * region. Unfortunately these two parameters are available at very different
2842 * times during task creation, so we need to split this into two steps.
2843 */
2844 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit,boolean_t disable_user_jop)2845 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit, boolean_t disable_user_jop)
2846 {
2847 if (inherit) {
2848 task->jop_pid = parent_task->jop_pid;
2849 } else if (disable_user_jop) {
2850 task->jop_pid = ml_non_arm64e_user_jop_pid();
2851 } else {
2852 task->jop_pid = ml_default_jop_pid();
2853 }
2854 }
2855
2856 void
ml_task_set_jop_pid_from_shared_region(task_t task,boolean_t disable_user_jop)2857 ml_task_set_jop_pid_from_shared_region(task_t task, boolean_t disable_user_jop)
2858 {
2859 if (disable_user_jop) {
2860 task->jop_pid = ml_non_arm64e_user_jop_pid();
2861 return;
2862 }
2863
2864 vm_shared_region_t sr = vm_shared_region_get(task);
2865 /*
2866 * If there's no shared region, we can assign the key arbitrarily. This
2867 * typically happens when Mach-O image activation failed part of the way
2868 * through, and this task is in the middle of dying with SIGKILL anyway.
2869 */
2870 if (__improbable(!sr)) {
2871 task->jop_pid = early_random();
2872 return;
2873 }
2874 vm_shared_region_deallocate(sr);
2875
2876 /*
2877 * Similarly we have to worry about jetsam having killed the task and
2878 * already cleared the shared_region_id.
2879 */
2880 task_lock(task);
2881 if (task->shared_region_id != NULL) {
2882 task->jop_pid = shared_region_find_key(task->shared_region_id);
2883 } else {
2884 task->jop_pid = early_random();
2885 }
2886 task_unlock(task);
2887 }
2888
/* Copy the task's jop_pid into the thread's machine state. */
void
ml_thread_set_jop_pid(thread_t thread, task_t task)
{
	thread->machine.jop_pid = task->jop_pid;
}
2894 #endif /* defined(HAS_APPLE_PAC) */
2895
2896 #if DEVELOPMENT || DEBUG
2897 static uint64_t minor_badness_suffered = 0;
2898 #endif
2899 void
ml_report_minor_badness(uint32_t __unused badness_id)2900 ml_report_minor_badness(uint32_t __unused badness_id)
2901 {
2902 #if DEVELOPMENT || DEBUG
2903 (void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
2904 #endif
2905 }
2906
2907 #if HAS_APPLE_PAC
2908 /**
2909 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2910 */
2911 void *
ml_poison_ptr(void * ptr,ptrauth_key key)2912 ml_poison_ptr(void *ptr, ptrauth_key key)
2913 {
2914 bool b_key = key & (1ULL << 0);
2915 uint64_t error_code;
2916 if (b_key) {
2917 error_code = 2;
2918 } else {
2919 error_code = 1;
2920 }
2921
2922 bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2923 bool data_key = key & (1ULL << 1);
2924 /* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2925 bool tbi = data_key && !kernel_pointer;
2926 unsigned int poison_shift;
2927 if (tbi) {
2928 poison_shift = 53;
2929 } else {
2930 poison_shift = 61;
2931 }
2932
2933 uintptr_t poisoned = (uintptr_t)ptr;
2934 poisoned &= ~(3ULL << poison_shift);
2935 poisoned |= error_code << poison_shift;
2936 return (void *)poisoned;
2937 }
2938 #endif /* HAS_APPLE_PAC */
2939
2940 #ifdef CONFIG_XNUPOST
2941 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2942 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2943 {
2944 thread_t thread = current_thread();
2945 thread->machine.expected_fault_handler = expected_fault_handler;
2946 thread->machine.expected_fault_addr = expected_fault_addr;
2947 thread->machine.expected_fault_pc = 0;
2948 }
2949
2950 /** Expect an exception to be thrown at EXPECTED_FAULT_PC */
2951 void
ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_pc)2952 ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_pc)
2953 {
2954 thread_t thread = current_thread();
2955 thread->machine.expected_fault_handler = expected_fault_handler;
2956 thread->machine.expected_fault_addr = 0;
2957 uintptr_t raw_func = (uintptr_t)ptrauth_strip(
2958 (void *)expected_fault_pc,
2959 ptrauth_key_function_pointer);
2960 thread->machine.expected_fault_pc = raw_func;
2961 }
2962
2963 void
ml_expect_fault_end(void)2964 ml_expect_fault_end(void)
2965 {
2966 thread_t thread = current_thread();
2967 thread->machine.expected_fault_handler = NULL;
2968 thread->machine.expected_fault_addr = 0;
2969 thread->machine.expected_fault_pc = 0;
2970 }
2971 #endif /* CONFIG_XNUPOST */
2972
/**
 * First hibernation-wake hook.
 *
 * Does nothing unless HIBERNATION is configured and gIOHibernateState is
 * kIOHibernateStateWakingFromHibernate. In that case the VM structures are
 * rebuilt and, on CONFIG_SPTM systems, pmap_hibernation_state is set to
 * SECURE_HMAC_HIB_RESTORE.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_rebuild_vm_structs();

#if CONFIG_SPTM
	/* Tell the pmap that hibernation restoration has started. */
	extern secure_hmac_hib_state_t pmap_hibernation_state;
	pmap_hibernation_state = SECURE_HMAC_HIB_RESTORE;
#endif /* CONFIG_SPTM */
#endif /* HIBERNATION */
}
2989
/**
 * Second hibernation-wake hook.
 *
 * Does nothing unless HIBERNATION is configured and gIOHibernateState is
 * kIOHibernateStateWakingFromHibernate. In that case it runs
 * hibernate_machine_init(), then hibernate_vm_lock_end(), and finally clears
 * the current CPU's cpu_hibernate flag.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
3001
3002 /**
3003 * Return back a machine-dependent array of address space regions that should be
3004 * reserved by the VM (pre-mapped in the address space). This will prevent user
3005 * processes from allocating or deallocating from within these regions.
3006 *
3007 * @param vm_is64bit True if the process has a 64-bit address space.
3008 * @param regions An out parameter representing an array of regions to reserve.
3009 *
3010 * @return The number of reserved regions returned through `regions`.
3011 */
3012 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,const struct vm_reserved_region ** regions)3013 ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
3014 {
3015 assert(regions != NULL);
3016
3017 /**
3018 * Reserved regions only apply to 64-bit address spaces. This is because
3019 * we only expect to grow the maximum user VA address on 64-bit address spaces
3020 * (we've essentially already reached the max for 32-bit spaces). The reserved
3021 * regions should safely fall outside of the max user VA for 32-bit processes.
3022 */
3023 if (vm_is64bit) {
3024 *regions = vm_reserved_regions;
3025 return ARRAY_COUNT(vm_reserved_regions);
3026 } else {
3027 /* Don't reserve any VA regions on arm64_32 processes. */
3028 *regions = NULL;
3029 return 0;
3030 }
3031 }
3032
/*
 * Per-cluster WFE timeout recommendations, indexed by cluster id. Entries are
 * written by ml_update_cluster_wfe_recommendation() and read by
 * ml_cluster_wfe_timeout().
 *
 * These WFE recommendations are expected to be updated on a relatively
 * infrequent cadence, possibly from a different cluster, hence
 * false cacheline sharing isn't expected to be material.
 */
static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];

/**
 * Publishes a new WFE timeout recommendation for one cluster.
 *
 * @param wfe_cluster_id               index of the cluster to update; asserted
 *                                     to be below MAX_CPU_CLUSTERS
 * @param wfe_timeout_abstime_interval new recommendation; asserted not to
 *                                     exceed ml_wfe_hint_max_interval
 * @param wfe_hint_flags               unused
 *
 * @return Always 0 (success).
 *
 * @note Both range checks are assert()s only; the store itself is a relaxed
 *       atomic store with no further validation.
 */
uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
{
	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
	return 0; /* Success */
}
3047
3048 #if DEVELOPMENT || DEBUG
3049 int wfe_rec_max = 0;
3050 int wfe_rec_none = 0;
3051 uint64_t wfe_rec_override_mat = 0;
3052 uint64_t wfe_rec_clamp = 0;
3053 #endif
3054
3055 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)3056 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
3057 {
3058 /* This and its consumer does not synchronize vis-a-vis updates
3059 * of the recommendation; races are acceptable.
3060 */
3061 uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
3062 #if DEVELOPMENT || DEBUG
3063 if (wfe_rec_clamp) {
3064 wfet = MIN(wfe_rec_clamp, wfet);
3065 }
3066
3067 if (wfe_rec_max) {
3068 for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
3069 if (arm64_cluster_wfe_recs[i] > wfet) {
3070 wfet = arm64_cluster_wfe_recs[i];
3071 }
3072 }
3073 }
3074
3075 if (wfe_rec_none) {
3076 wfet = 0;
3077 }
3078
3079 if (wfe_rec_override_mat) {
3080 wfet = wfe_rec_override_mat;
3081 }
3082 #endif
3083 return wfet;
3084 }
3085
3086 __pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)3087 ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
3088 {
3089 #if CONFIG_SPTM
3090 /**
3091 * If the address is within one of the SPTM-allocated per-cpu stacks, then
3092 * return true.
3093 */
3094 if ((addr >= SPTMArgs->cpu_stack_papt_start) &&
3095 (addr < SPTMArgs->cpu_stack_papt_end)) {
3096 return true;
3097 }
3098
3099 /**
3100 * If the address is within one of the TXM thread stacks, then return true.
3101 * The SPTM guarantees that these stacks are virtually contiguous.
3102 */
3103 if ((addr >= SPTMArgs->txm_thread_stacks[0]) &&
3104 (addr < SPTMArgs->txm_thread_stacks[MAX_CPUS - 1])) {
3105 return true;
3106 }
3107
3108 return false;
3109 #elif XNU_MONITOR
3110 return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
3111 #else
3112 return false;
3113 #endif /* CONFIG_SPTM || XNU_MONITOR */
3114 }
3115
3116 uint64_t
ml_get_backtrace_pc(struct arm_saved_state * state)3117 ml_get_backtrace_pc(struct arm_saved_state *state)
3118 {
3119 assert((state != NULL) && is_saved_state64(state));
3120
3121 #if CONFIG_SPTM
3122 /**
3123 * On SPTM-based systems, when a non-XNU domain (e.g., SPTM) is interrupted,
3124 * the PC value saved into the state is not the actual PC at the interrupted
3125 * point, but a fixed value to a handler that knows how to re-enter the
3126 * interrupted domain. The interrupted domain's actual PC value is saved
3127 * into x14, so let's return that instead.
3128 */
3129 if (ml_addr_in_non_xnu_stack(get_saved_state_fp(state))) {
3130 return saved_state64(state)->x[14];
3131 }
3132 #endif /* CONFIG_SPTM */
3133
3134 return get_saved_state_pc(state);
3135 }
3136
3137
/**
 * Panic because an ARM saved-state accessor expected user saved-state but was
 * passed non-user saved-state.
 *
 * The panic message includes the address of the offending saved-state.
 *
 * @param ss invalid saved-state (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_old_cpsr(const arm_saved_state_t *ss)
{
	panic("invalid CPSR in user saved-state %p", ss);
}
3149
/**
 * Panic because an ARM saved-state accessor was passed user saved-state and
 * asked to assign a non-user CPSR.
 *
 * The panic message includes the rejected CPSR value and the address of the
 * saved-state.
 *
 * @param ss original EL0 saved-state
 * @param cpsr invalid new CPSR value (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_new_cpsr(const arm_saved_state_t *ss, uint32_t cpsr)
{
	panic("attempt to set non-user CPSR %#010x on user saved-state %p", cpsr, ss);
}
3162
3163 #if HAS_MTE
3164
3165 #if APPLEVIRTUALPLATFORM
3166 static SECURITY_READ_ONLY_LATE(bool) have_apple_mte_tag_generator;
3167 #else
3168 static const bool have_apple_mte_tag_generator = true;
3169 #endif
3170
3171 static uint64_t
arm_mte_random_rgsr_el1_seed(void)3172 arm_mte_random_rgsr_el1_seed(void)
3173 {
3174 uint64_t seed;
3175 /*
3176 * RGSR_EL1.SEED must be non-zero. Otherwise the LFSR used during
3177 * random tag generation will just produce an endless stream of 0 bits.
3178 */
3179 do {
3180 seed = early_random();
3181 if (have_apple_mte_tag_generator) {
3182 seed &= RGSR_EL1_SEED_RRND_1_MASK;
3183 seed |= (0b111 << RGSR_EL1_SEED_OFFSET);
3184 } else {
3185 seed &= RGSR_EL1_SEED_RRND_0_MASK;
3186 }
3187 } while (seed == 0);
3188 return seed;
3189 }
3190
3191 void
arm_mte_tag_generator_init(bool is_boot_cpu)3192 arm_mte_tag_generator_init(bool is_boot_cpu)
3193 {
3194 #if APPLEVIRTUALPLATFORM
3195 if (is_boot_cpu) {
3196 uint64_t aidr_mtever = __builtin_arm_rsr64("AIDR_EL1") & AIDR_MTEVER_MASK;
3197 if (aidr_mtever == AIDR_MTEVER_V1) {
3198 have_apple_mte_tag_generator = true;
3199 }
3200 }
3201 #else
3202 #pragma unused(is_boot_cpu)
3203 #endif
3204
3205 /*
3206 * Note: ARM guarantees that all accesses to RGSR_EL1 occur in program
3207 * order relative to other instructions. So no barriers are needed to
3208 * ensure that the GCR_EL1 write is ordered before the RGSR_EL1 write,
3209 * or that the RGSR_EL1 write is ordered before any instructions that
3210 * use RGSR_EL1 to generate tags.
3211 */
3212 if (have_apple_mte_tag_generator) {
3213 uint64_t gcr_el1 = __builtin_arm_rsr64("GCR_EL1");
3214 gcr_el1 |= GCR_EL1_RRND;
3215 __builtin_arm_wsr64("GCR_EL1", gcr_el1);
3216 }
3217
3218 uint64_t seed = is_boot_cpu ? arm_mte_random_rgsr_el1_seed() : getCpuDatap()->mte_rgsr_el1_seed;
3219 __builtin_arm_wsr64("RGSR_EL1", seed);
3220 }
3221 #endif /* HAS_MTE */
3222
/**
 * Explicitly preallocates a floating point save area.
 *
 * This is a no-op on ARM because preallocation isn't required at this time;
 * the function body is intentionally empty.
 */
void
ml_fp_save_area_prealloc(void)
{
}
3231
3232
/**
 * Machine-dependent hook run after task signature processing.
 *
 * As visible here, its only effect is an acquire fence.
 *
 * @param task unused
 */
void
ml_task_post_signature_processing_hook(__unused task_t task)
{
	/**
	 * Acquire barrier: ensures that the machine flags read that follows is
	 * not speculated ahead of the earlier read of task->t_returnwaitflags in
	 * task_wait_to_return().
	 *
	 * NOTE(review): no such read follows the fence in this function body as
	 * it stands; confirm whether it lives in a configuration not shown here.
	 */
	os_atomic_thread_fence(acquire);

}
3244
3245 #if HAS_MTE
/**
 * Gets a flag indicating whether a thread should have MTE tag access disabled,
 * even when the current map has MTE tag access enabled.
 *
 * This is a plain read of thread->machine.sec_override with no locking or
 * preemption requirement imposed by this function.
 *
 * @param thread the thread to inspect
 * @returns whether to override MTE tag access for this thread
 */
bool
ml_thread_get_sec_override(thread_t thread)
{
	return thread->machine.sec_override;
}
3258
/**
 * Sets a flag on the thread to indicate that MTE tag access should be disabled,
 * even when the current map has MTE tag access enabled.
 *
 * @warning This function is intended to be used by `vm_map_switch_*`, where the
 * caller switches pmaps after setting this flag. `ml_thread_set_sec_override`
 * and the accompanying `pmap_switch` MUST be called together in a
 * preemption-disabled context.
 *
 * @note Currently this function can only safely update current_thread(). Both
 * this and the preemption requirement are enforced by assert() only.
 *
 * @param thread the target thread
 * @param sec_override the new override setting
 */
void
ml_thread_set_sec_override(thread_t thread, bool sec_override)
{
	assert(!preemption_enabled());
	assert(thread == current_thread());
	thread->machine.sec_override = sec_override;
}
3280 #endif /* HAS_MTE */
3281
3282 #if DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT
/* Set once ml_unsafe_kernel_text_init() has successfully read the device tree property. */
static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text_initialized) = false;
/* True when the "kernel-ctrr-to-be-enabled" device tree property was read as 0. */
static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text) = false;

/**
 * Returns the cached _unsafe_kernel_text flag.
 *
 * Asserts that ml_unsafe_kernel_text_init() managed to initialize the flag.
 * If the device tree lookup failed, that assertion fires on kernels with
 * assertions enabled.
 *
 * @return true when "kernel-ctrr-to-be-enabled" was read as 0, false otherwise
 */
__mockable bool
ml_unsafe_kernel_text(void)
{
	assert(_unsafe_kernel_text_initialized);
	return _unsafe_kernel_text;
}
3292
3293 __startup_func
3294 static void
ml_unsafe_kernel_text_init(void)3295 ml_unsafe_kernel_text_init(void)
3296 {
3297 /* Grab the values written by iBoot. */
3298
3299 DTEntry entry;
3300 const void *value;
3301 unsigned int size;
3302 if (SecureDTLookupEntry(0, "/chosen", &entry) == kSuccess &&
3303 SecureDTGetProperty(entry, "kernel-ctrr-to-be-enabled", &value, &size) == kSuccess &&
3304 size == sizeof(int)) {
3305 _unsafe_kernel_text_initialized = true;
3306 _unsafe_kernel_text = (0 == *(const int *)value);
3307 }
3308 }
3309 STARTUP(TUNABLES, STARTUP_RANK_FIRST, ml_unsafe_kernel_text_init);
3310
3311 #else /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */
/**
 * Variant built when none of DEVELOPMENT, DEBUG, CONFIG_DTRACE or
 * CONFIG_CSR_FROM_DT is set. No device tree state is consulted.
 *
 * @return Always false.
 */
bool
ml_unsafe_kernel_text(void)
{
	/* Kernel text is never writable under these configs. */
	return false;
}
3318 #endif /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */
3319