1 /*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_shared_region.h>
56 #include <vm/vm_map.h>
57 #include <sys/codesign.h>
58 #include <sys/kdebug.h>
59 #include <kern/coalition.h>
60 #include <pexpert/device_tree.h>
61
62 #include <IOKit/IOPlatformExpert.h>
63 #if HIBERNATION
64 #include <IOKit/IOHibernatePrivate.h>
65 #endif /* HIBERNATION */
66
67 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
68 #include <arm64/amcc_rorgn.h>
69 #endif
70
71
72
73 #include <libkern/section_keywords.h>
74
75 /**
76 * On supported hardware, debuggable builds make the HID bits read-only
77 * without locking them. This lets people manually modify HID bits while
78 * debugging, since they can use a debugging tool to first reset the HID
79 * bits back to read/write. However it will still catch xnu changes that
80 * accidentally write to HID bits after they've been made read-only.
81 */
82
83 #if KPC
84 #include <kern/kpc.h>
85 #endif
86
87 #define MPIDR_CPU_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
88 #define MPIDR_CLUSTER_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
89
90 #if HAS_CLUSTER
91 static uint8_t cluster_initialized = 0;
92 #endif
93
94 MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
95 machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout
96
97 MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
98
99 MACHINE_TIMEOUT_DEV_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
100
101 uint64_t low_MutexSpin;
102 int64_t high_MutexSpin;
103
104
105
106 static uint64_t ml_wfe_hint_max_interval;
107 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
108
109 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
110 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
111
112 extern vm_offset_t segLOWEST;
113 extern vm_offset_t segLOWESTTEXT;
114 extern vm_offset_t segLASTB;
115 extern unsigned long segSizeLAST;
116
117 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
118 extern vm_offset_t vm_kernelcache_base;
119 extern vm_offset_t vm_kernelcache_top;
120
121 extern vm_offset_t arm_vm_kernelcache_phys_start;
122 extern vm_offset_t arm_vm_kernelcache_phys_end;
123
124 #if defined(HAS_IPI)
125 unsigned int gFastIPI = 1;
126 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
127 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
128 kDeferredIPITimerDefault);
129 #endif /* defined(HAS_IPI) */
130
131 thread_t Idle_context(void);
132
133 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
134
135 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
136 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
137 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
138 .version = CPU_TOPOLOGY_VERSION,
139 .cpus = topology_cpu_array,
140 .clusters = topology_cluster_array,
141 };
142
143 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
144
145 /**
146 * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
147 * entries of an arbitrary data type. This is intended for use by specialized consumers
148 * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
149 * as follows:
150 * hypothetical_array[cluster_offsets[AFF1] + AFF0]
151 * Most consumers should instead use general-purpose facilities such as PERCPU or
152 * ml_get_cpu_number().
153 */
154 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
155
156 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
157
158 extern uint32_t lockdown_done;
159
160 /**
161 * Represents regions of virtual address space that should be reserved
162 * (pre-mapped) in each user address space.
163 */
164 static const struct vm_reserved_region vm_reserved_regions[] = {
165 {
166 .vmrr_name = "GPU Carveout",
167 .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
168 .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
169 },
170 /*
171 * Reserve the virtual memory space representing the commpage nesting region
172 * to prevent user processes from allocating memory within it. The actual
173 * page table entries for the commpage are inserted by vm_commpage_enter().
174 * This vm_map_enter() just prevents userspace from allocating/deallocating
175 * anything within the entire commpage nested region.
176 */
177 {
178 .vmrr_name = "commpage nesting",
179 .vmrr_addr = _COMM_PAGE64_NESTING_START,
180 .vmrr_size = _COMM_PAGE64_NESTING_SIZE
181 }
182 };
183
184 uint32_t get_arm_cpu_version(void);
185
#if defined(HAS_IPI)
/*
 * Post a fast IPI request of kind `type` to the core identified by
 * `cpu_mpidr`.
 *
 * The request word always carries `type` OR-ed with the target's Aff0 field.
 * On HAS_CLUSTER builds a target in the caller's own cluster is signalled
 * through S3_5_C15_C0_0; a target in another cluster additionally carries its
 * Aff1 field at IPI_RR_TARGET_CLUSTER_SHIFT and goes through S3_5_C15_C0_1.
 * Builds without HAS_CLUSTER always write S3_5_C15_C0_1.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);
#if HAS_CLUSTER
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
	uint64_t self_mpidr;

	/*
	 * NOTE: the caller is expected to be non-preemptible, or at least
	 * bound to a single CPU. Otherwise the thread could migrate between
	 * picking the IPI mechanism and actually issuing the IPI.
	 */
	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Remote cluster: the request must also name the target cluster. */
		request |= (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT);
		MSR("S3_5_C15_C0_1", request);
	} else {
		/* Same cluster: the local request register is sufficient. */
		MSR("S3_5_C15_C0_0", request);
	}
#else
	MSR("S3_5_C15_C0_1", request);
#endif
}
#endif
211
212 #if !defined(HAS_IPI)
213 __dead2
214 #endif
215 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)216 ml_cpu_signal(unsigned int cpu_mpidr __unused)
217 {
218 #if defined(HAS_IPI)
219 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
220 #else
221 panic("Platform does not support ACC Fast IPI");
222 #endif
223 }
224
225 #if !defined(HAS_IPI)
226 __dead2
227 #endif
228 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)229 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
230 {
231 #if defined(HAS_IPI)
232 /* adjust IPI_CR timer countdown value for deferred IPI
233 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
234 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
235 *
236 * global register, should only require a single write to update all
237 * CPU cores: from Skye ACC user spec section 5.7.3.3
238 *
239 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
240 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
241 */
242 uint64_t abstime;
243
244 nanoseconds_to_absolutetime(nanosecs, &abstime);
245
246 abstime = MIN(abstime, 0xFFFF);
247
248 /* update deferred_ipi_timer_ns with the new clamped value */
249 absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
250
251 MSR("S3_5_C15_C3_1", abstime);
252 #else
253 (void)nanosecs;
254 panic("Platform does not support ACC Fast IPI");
255 #endif
256 }
257
/*
 * Return the deferred-IPI countdown, in nanoseconds, as last recorded in
 * deferred_ipi_timer_ns. Builds without HAS_IPI always report 0.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
267
268 #if !defined(HAS_IPI)
269 __dead2
270 #endif
271 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)272 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
273 {
274 #if defined(HAS_IPI)
275 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
276 #else
277 panic("Platform does not support ACC Fast IPI deferral");
278 #endif
279 }
280
281 #if !defined(HAS_IPI)
282 __dead2
283 #endif
284 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)285 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
286 {
287 #if defined(HAS_IPI)
288 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
289 #else
290 panic("Platform does not support ACC Fast IPI retraction");
291 #endif
292 }
293
294 extern uint32_t idle_proximate_io_wfe_unmasked;
295
296 #define CPUPM_IDLE_WFE 0x5310300
297 static bool
wfe_process_recommendation(void)298 wfe_process_recommendation(void)
299 {
300 bool ipending = false;
301 if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
302 /* Check for an active perf. controller generated
303 * WFE recommendation for this cluster.
304 */
305 cpu_data_t *cdp = getCpuDatap();
306 uint32_t cid = cdp->cpu_cluster_id;
307 uint64_t wfe_ttd = 0;
308 uint64_t wfe_deadline = 0;
309
310 if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
311 wfe_deadline = mach_absolute_time() + wfe_ttd;
312 }
313
314 if (wfe_deadline != 0) {
315 /* Poll issuing event-bounded WFEs until an interrupt
316 * arrives or the WFE recommendation expires
317 */
318 #if DEVELOPMENT || DEBUG
319 uint64_t wc = cdp->wfe_count;
320 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
321 #endif
322 /* Issue WFE until the recommendation expires,
323 * with IRQs unmasked.
324 */
325 ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true);
326 #if DEVELOPMENT || DEBUG
327 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
328 #endif
329 }
330 }
331 return ipending;
332 }
333
334 void
machine_idle(void)335 machine_idle(void)
336 {
337 /* Interrupts are expected to be masked on entry or re-entry via
338 * Idle_load_context()
339 */
340 assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
341 /* Check for, and act on, a WFE recommendation.
342 * Bypasses context spill/fill for a minor perf. increment.
343 * May unmask and restore IRQ+FIQ mask.
344 */
345 if (wfe_process_recommendation() == false) {
346 /* If WFE recommendation absent, or WFE deadline
347 * arrived with no interrupt pending/processed,
348 * fall back to WFI.
349 */
350 Idle_context();
351 }
352 __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
353 }
354
355 void
OSSynchronizeIO(void)356 OSSynchronizeIO(void)
357 {
358 __builtin_arm_dsb(DSB_SY);
359 }
360
/* Read and return the current value of ACTLR_EL1. */
uint64_t
get_aux_control(void)
{
	uint64_t actlr;

	MRS(actlr, "ACTLR_EL1");

	return actlr;
}
369
/* Read and return the current value of SCTLR_EL1. */
uint64_t
get_mmu_control(void)
{
	uint64_t sctlr;

	MRS(sctlr, "SCTLR_EL1");

	return sctlr;
}
378
/* Read and return the current value of TCR_EL1. */
uint64_t
get_tcr(void)
{
	uint64_t tcr;

	MRS(tcr, "TCR_EL1");

	return tcr;
}
387
388 boolean_t
ml_get_interrupts_enabled(void)389 ml_get_interrupts_enabled(void)
390 {
391 uint64_t value;
392
393 MRS(value, "DAIF");
394 if (value & DAIF_IRQF) {
395 return FALSE;
396 }
397 return TRUE;
398 }
399
400 pmap_paddr_t
get_mmu_ttb(void)401 get_mmu_ttb(void)
402 {
403 pmap_paddr_t value;
404
405 MRS(value, "TTBR0_EL1");
406 return value;
407 }
408
409 uint32_t
get_arm_cpu_version(void)410 get_arm_cpu_version(void)
411 {
412 uint32_t value = machine_read_midr();
413
414 /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
415 return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
416 }
417
/*
 * Return true when any bit of `feature_bit` is set in AIDR_EL1,
 * false otherwise.
 */
bool
ml_feature_supported(uint32_t feature_bit)
{
	uint64_t aidr = 0;

	MRS(aidr, "AIDR_EL1");

	return (aidr & feature_bit) != 0;
}
428
429 /*
430 * user_cont_hwclock_allowed()
431 *
432 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
433 * as a continuous time source (e.g. from mach_continuous_time)
434 */
435 boolean_t
user_cont_hwclock_allowed(void)436 user_cont_hwclock_allowed(void)
437 {
438 #if HAS_CONTINUOUS_HWCLOCK
439 return TRUE;
440 #else
441 return FALSE;
442 #endif
443 }
444
445 /*
446 * user_timebase_type()
447 *
448 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
449 *
450 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
451 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
452 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
453 *
454 */
455
456 uint8_t
user_timebase_type(void)457 user_timebase_type(void)
458 {
459 #if HAS_ACNTVCT
460 return USER_TIMEBASE_NOSPEC_APPLE;
461 #elif __ARM_ARCH_8_6__
462 return USER_TIMEBASE_NOSPEC;
463 #else
464 return USER_TIMEBASE_SPEC;
465 #endif
466 }
467
468 void
machine_startup(__unused boot_args * args)469 machine_startup(__unused boot_args * args)
470 {
471 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
472 if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
473 gFastIPI = 1;
474 }
475 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
476
477
478 machine_conf();
479
480
481 /*
482 * Kick off the kernel bootstrap.
483 */
484 kernel_bootstrap();
485 /* NOTREACHED */
486 }
487
488 typedef void (*invalidate_fn_t)(void);
489
490 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
491
492 void set_invalidate_hmac_function(invalidate_fn_t fn);
493
494 void
set_invalidate_hmac_function(invalidate_fn_t fn)495 set_invalidate_hmac_function(invalidate_fn_t fn)
496 {
497 if (NULL != invalidate_hmac_function) {
498 panic("Invalidate HMAC function already set");
499 }
500
501 invalidate_hmac_function = fn;
502 }
503
504 void
machine_lockdown(void)505 machine_lockdown(void)
506 {
507 arm_vm_prot_finalize(PE_state.bootArgs);
508
509 #if CONFIG_KERNEL_INTEGRITY
510 #if KERNEL_INTEGRITY_WT
511 /* Watchtower
512 *
513 * Notify the monitor about the completion of early kernel bootstrap.
514 * From this point forward it will enforce the integrity of kernel text,
515 * rodata and page tables.
516 */
517
518 #ifdef MONITOR
519 monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
520 #endif
521 #endif /* KERNEL_INTEGRITY_WT */
522
523 #if XNU_MONITOR
524 pmap_lockdown_ppl();
525 #endif
526
527 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
528 /* KTRR
529 *
530 * Lock physical KTRR region. KTRR region is read-only. Memory outside
531 * the region is not executable at EL1.
532 */
533
534 rorgn_lockdown();
535 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
536
537 #endif /* CONFIG_KERNEL_INTEGRITY */
538
539
540 if (NULL != invalidate_hmac_function) {
541 invalidate_hmac_function();
542 }
543
544 lockdown_done = 1;
545 }
546
547
548 char *
machine_boot_info(__unused char * buf,__unused vm_size_t size)549 machine_boot_info(
550 __unused char *buf,
551 __unused vm_size_t size)
552 {
553 return PE_boot_args();
554 }
555
556 void
slave_machine_init(__unused void * param)557 slave_machine_init(__unused void *param)
558 {
559 cpu_machine_init(); /* Initialize the processor */
560 clock_init(); /* Init the clock */
561 }
562
563 /*
564 * Routine: machine_processor_shutdown
565 * Function:
566 */
567 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)568 machine_processor_shutdown(
569 __unused thread_t thread,
570 void (*doshutdown)(processor_t),
571 processor_t processor)
572 {
573 return Shutdown_context(doshutdown, processor);
574 }
575
576 /*
577 * Routine: ml_init_lock_timeout
578 * Function:
579 */
580 static void __startup_func
ml_init_lock_timeout(void)581 ml_init_lock_timeout(void)
582 {
583 /*
584 * This function is called after STARTUP_SUB_TIMEOUTS
585 * initialization, so using the "legacy" boot-args here overrides
586 * the ml-timeout-... configuration. (Given that these boot-args
587 * here are usually explicitly specified, this makes sense by
588 * overriding ml-timeout-..., which may come from the device tree.
589 */
590
591 uint64_t lto_timeout_ns;
592 uint64_t lto_abstime;
593 uint32_t slto;
594
595 if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
596 lto_timeout_ns = slto * NSEC_PER_USEC;
597 nanoseconds_to_absolutetime(lto_timeout_ns, <o_abstime);
598 os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
599 } else {
600 lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
601 absolutetime_to_nanoseconds(lto_abstime, <o_timeout_ns);
602 }
603
604 os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
605
606 if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
607 nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, <o_abstime);
608 os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
609 } else if (lto_abstime != 0) {
610 os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
611 } // else take default from MACHINE_TIMEOUT.
612
613 uint64_t mtxspin;
614 uint64_t mtx_abstime;
615 if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
616 if (mtxspin > USEC_PER_SEC >> 4) {
617 mtxspin = USEC_PER_SEC >> 4;
618 }
619 nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
620 os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
621 } else {
622 mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
623 }
624
625 low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
626 /*
627 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
628 * real_ncpus is not set at this time
629 *
630 * NOTE: active spinning is disabled in arm. It can be activated
631 * by setting high_MutexSpin through the sysctl.
632 */
633 high_MutexSpin = low_MutexSpin;
634
635 uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
636 PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
637 nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
638 }
639 STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
640
641
642 /*
643 * This is called when all of the ml_processor_info_t structures have been
644 * initialized and all the processors have been started through processor_start().
645 *
646 * Required by the scheduler subsystem.
647 */
648 void
ml_cpu_init_completed(void)649 ml_cpu_init_completed(void)
650 {
651 if (SCHED(cpu_init_completed) != NULL) {
652 SCHED(cpu_init_completed)();
653 }
654 }
655
/*
 * ml_cpu_up() and ml_cpu_up_update_counts() are called from the
 * machine-independent routine cpu_up() to perform machine-dependent info
 * updates.
 *
 * The CPU count update is kept separate from the other actions because the
 * counts are not updated when CLPC causes temporary cluster powerdown
 * events, which must be transparent to the user.
 */
void
ml_cpu_up(void)
{
	/* Nothing to do here. */
}
668
669 void
ml_cpu_up_update_counts(int cpu_id)670 ml_cpu_up_update_counts(int cpu_id)
671 {
672 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
673
674 os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
675
676 os_atomic_inc(&machine_info.physical_cpu, relaxed);
677 os_atomic_inc(&machine_info.logical_cpu, relaxed);
678 }
679
680 /*
681 * These are called from the machine-independent routine cpu_down()
682 * to perform machine-dependent info updates.
683 *
684 * The update to CPU counts needs to be separate from other actions
685 * because we don't update the counts when CLPC causes temporary
686 * cluster powerdown events, as these must be transparent to the user.
687 */
688 void
ml_cpu_down(void)689 ml_cpu_down(void)
690 {
691 /*
692 * If we want to deal with outstanding IPIs, we need to
693 * do relatively early in the processor_doshutdown path,
694 * as we pend decrementer interrupts using the IPI
695 * mechanism if we cannot immediately service them (if
696 * IRQ is masked). Do so now.
697 *
698 * We aren't on the interrupt stack here; would it make
699 * more sense to disable signaling and then enable
700 * interrupts? It might be a bit cleaner.
701 */
702 cpu_data_t *cpu_data_ptr = getCpuDatap();
703 cpu_data_ptr->cpu_running = FALSE;
704
705 if (cpu_data_ptr != &BootCpuData) {
706 /*
707 * Move all of this cpu's timers to the master/boot cpu,
708 * and poke it in case there's a sooner deadline for it to schedule.
709 */
710 timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
711 kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
712 if (rv != KERN_SUCCESS) {
713 panic("ml_cpu_down: IPI failure %d", rv);
714 }
715 }
716
717 cpu_signal_handler_internal(TRUE);
718 }
719 void
ml_cpu_down_update_counts(int cpu_id)720 ml_cpu_down_update_counts(int cpu_id)
721 {
722 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
723
724 os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
725
726 os_atomic_dec(&machine_info.physical_cpu, relaxed);
727 os_atomic_dec(&machine_info.logical_cpu, relaxed);
728 }
729
730
731 unsigned int
ml_get_machine_mem(void)732 ml_get_machine_mem(void)
733 {
734 return machine_info.memory_size;
735 }
736
737 __attribute__((noreturn))
738 void
halt_all_cpus(boolean_t reboot)739 halt_all_cpus(boolean_t reboot)
740 {
741 if (reboot) {
742 printf("MACH Reboot\n");
743 PEHaltRestart(kPERestartCPU);
744 } else {
745 printf("CPU halted\n");
746 PEHaltRestart(kPEHaltCPU);
747 }
748 while (1) {
749 ;
750 }
751 }
752
753 __attribute__((noreturn))
754 void
halt_cpu(void)755 halt_cpu(void)
756 {
757 halt_all_cpus(FALSE);
758 }
759
760 /*
761 * Routine: machine_signal_idle
762 * Function:
763 */
764 void
machine_signal_idle(processor_t processor)765 machine_signal_idle(
766 processor_t processor)
767 {
768 cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
769 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
770 }
771
772 void
machine_signal_idle_deferred(processor_t processor)773 machine_signal_idle_deferred(
774 processor_t processor)
775 {
776 cpu_signal_deferred(processor_to_cpu_datap(processor));
777 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
778 }
779
780 void
machine_signal_idle_cancel(processor_t processor)781 machine_signal_idle_cancel(
782 processor_t processor)
783 {
784 cpu_signal_cancel(processor_to_cpu_datap(processor));
785 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
786 }
787
788 /*
789 * Routine: ml_install_interrupt_handler
790 * Function: Initialize Interrupt Handler
791 */
792 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)793 ml_install_interrupt_handler(
794 void *nub,
795 int source,
796 void *target,
797 IOInterruptHandler handler,
798 void *refCon)
799 {
800 cpu_data_t *cpu_data_ptr;
801 boolean_t current_state;
802
803 current_state = ml_set_interrupts_enabled(FALSE);
804 cpu_data_ptr = getCpuDatap();
805
806 cpu_data_ptr->interrupt_nub = nub;
807 cpu_data_ptr->interrupt_source = source;
808 cpu_data_ptr->interrupt_target = target;
809 cpu_data_ptr->interrupt_handler = handler;
810 cpu_data_ptr->interrupt_refCon = refCon;
811
812 (void) ml_set_interrupts_enabled(current_state);
813 }
814
/*
 *	Routine:        ml_init_interrupt
 *	Function:       Initialize Interrupts. On HAS_IPI builds this programs
 *	                the deferred-IPI countdown from deferred_ipi_timer_ns;
 *	                otherwise it does nothing.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * This routine runs once per CPU, but skye has only one global copy
	 * of the register, so repeating the write everywhere is redundant.
	 * The write is limited to CPUs whose cpu data has cluster_master set.
	 * NOTE(review): the earlier wording here said "bootstrap cpu" while
	 * the check is on cluster_master -- confirm which is intended.
	 */
	cpu_data_t *cdp = getCpuDatap();

	if (cdp->cluster_master) {
		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
	}
#endif
}
833
834 /*
835 * Routine: ml_init_timebase
836 * Function: register and setup Timebase, Decremeter services
837 */
838 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)839 ml_init_timebase(
840 void *args,
841 tbd_ops_t tbd_funcs,
842 vm_offset_t int_address,
843 vm_offset_t int_value __unused)
844 {
845 cpu_data_t *cpu_data_ptr;
846
847 cpu_data_ptr = (cpu_data_t *)args;
848
849 if ((cpu_data_ptr == &BootCpuData)
850 && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
851 rtclock_timebase_func = *tbd_funcs;
852 rtclock_timebase_addr = int_address;
853 }
854 }
855
856 #define ML_READPROP_MANDATORY UINT64_MAX
857
858 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)859 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
860 {
861 void const *prop;
862 unsigned int propSize;
863
864 if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
865 if (propSize == sizeof(uint8_t)) {
866 return *((uint8_t const *)prop);
867 } else if (propSize == sizeof(uint16_t)) {
868 return *((uint16_t const *)prop);
869 } else if (propSize == sizeof(uint32_t)) {
870 return *((uint32_t const *)prop);
871 } else if (propSize == sizeof(uint64_t)) {
872 return *((uint64_t const *)prop);
873 } else {
874 panic("CPU property '%s' has bad size %u", propertyName, propSize);
875 }
876 } else {
877 if (default_value == ML_READPROP_MANDATORY) {
878 panic("Missing mandatory property '%s'", propertyName);
879 }
880 return default_value;
881 }
882 }
883
884 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)885 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
886 {
887 uint64_t const *prop;
888 unsigned int propSize;
889
890 if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
891 return FALSE;
892 }
893
894 if (propSize != sizeof(uint64_t) * 2) {
895 panic("Wrong property size for %s", propertyName);
896 }
897
898 *pa_ptr = prop[0];
899 *len_ptr = prop[1];
900 return TRUE;
901 }
902
903 static boolean_t
ml_is_boot_cpu(const DTEntry entry)904 ml_is_boot_cpu(const DTEntry entry)
905 {
906 void const *prop;
907 unsigned int propSize;
908
909 if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
910 panic("unable to retrieve state for cpu");
911 }
912
913 if (strncmp((char const *)prop, "running", propSize) == 0) {
914 return TRUE;
915 } else {
916 return FALSE;
917 }
918 }
919
920 static void
ml_read_chip_revision(unsigned int * rev __unused)921 ml_read_chip_revision(unsigned int *rev __unused)
922 {
923 // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
924 #ifdef APPLE_ARM64_ARCH_FAMILY
925 DTEntry entryP;
926
927 if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
928 *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
929 } else {
930 *rev = CPU_VERSION_UNKNOWN;
931 }
932 #endif
933 }
934
935 void
ml_parse_cpu_topology(void)936 ml_parse_cpu_topology(void)
937 {
938 DTEntry entry, child __unused;
939 OpaqueDTEntryIterator iter;
940 uint32_t cpu_boot_arg = MAX_CPUS;
941 uint64_t cpumask_boot_arg = ULLONG_MAX;
942 int err;
943
944 int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
945 int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
946 const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
947 const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
948
949 // The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
950 // so that we trigger a panic later in the boot process, once serial is enabled.
951 if (cpus_boot_arg_present && cpumask_boot_arg_present) {
952 cpu_config_correct = false;
953 }
954
955 err = SecureDTLookupEntry(NULL, "/cpus", &entry);
956 assert(err == kSuccess);
957
958 err = SecureDTInitEntryIterator(entry, &iter);
959 assert(err == kSuccess);
960
961 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
962 cluster_offsets[i] = -1;
963 cluster_phys_to_logical[i] = -1;
964 cluster_max_cpu_phys_id[i] = 0;
965 }
966
967 while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
968 boolean_t is_boot_cpu = ml_is_boot_cpu(child);
969 boolean_t cpu_enabled = cpumask_boot_arg & 1;
970 cpumask_boot_arg >>= 1;
971
972 // Boot CPU disabled in cpumask. Flag this so that we trigger a panic
973 // later in the boot process, once serial is enabled.
974 if (is_boot_cpu && !cpu_enabled) {
975 cpu_config_correct = false;
976 }
977
978 // Ignore this CPU if it has been disabled by the cpumask= boot-arg.
979 if (!is_boot_cpu && !cpu_enabled) {
980 continue;
981 }
982
983 // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
984 // been added to the topology struct yet, and we only have one slot left, then skip
985 // every other non-boot CPU in order to leave room for the boot CPU.
986 //
987 // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
988 // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted.
989 if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
990 continue;
991 }
992 if (topology_info.num_cpus >= cpu_boot_arg) {
993 break;
994 }
995
996 ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
997
998 cpu->cpu_id = topology_info.num_cpus++;
999 assert(cpu->cpu_id < MAX_CPUS);
1000 topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
1001
1002 cpu->die_id = 0;
1003 topology_info.max_die_id = 0;
1004
1005 cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
1006
1007 cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
1008 cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
1009 cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
1010 cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
1011 cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
1012
1013 ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
1014 ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
1015 ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1016 cpu->cluster_type = CLUSTER_TYPE_SMP;
1017
1018 int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1019 if (cluster_type == 'E') {
1020 cpu->cluster_type = CLUSTER_TYPE_E;
1021 } else if (cluster_type == 'P') {
1022 cpu->cluster_type = CLUSTER_TYPE_P;
1023 }
1024
1025 topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1026
1027 /*
1028 * Since we want to keep a linear cluster ID space, we cannot just rely
1029 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1030 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1031 */
1032 #if HAS_CLUSTER
1033 uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1034 #else
1035 uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1036 #endif
1037 assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1038 cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1039 topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1040
1041 assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1042
1043 ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1044 if (cluster->num_cpus == 0) {
1045 assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1046
1047 topology_info.num_clusters++;
1048 topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1049 topology_info.cluster_types |= (1 << cpu->cluster_type);
1050
1051 cluster->cluster_id = cpu->cluster_id;
1052 cluster->cluster_type = cpu->cluster_type;
1053 cluster->first_cpu_id = cpu->cpu_id;
1054 assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1055 cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1056
1057 topology_info.cluster_type_num_clusters[cluster->cluster_type]++;
1058
1059 // Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1060 // If we wind up with a bunch of these, we might want to create separate per-cluster
1061 // EDT nodes and have the CPU nodes reference them through a phandle.
1062 ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1063 ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1064 }
1065
1066 #if HAS_CLUSTER
1067 if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1068 cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1069 }
1070 #endif
1071
1072 cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1073 cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1074
1075 cluster->num_cpus++;
1076 cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1077
1078 if (is_boot_cpu) {
1079 assert(topology_info.boot_cpu == NULL);
1080 topology_info.boot_cpu = cpu;
1081 topology_info.boot_cluster = cluster;
1082 }
1083
1084 }
1085
1086 #if HAS_CLUSTER
1087 /*
1088 * Build the cluster offset array, ensuring that the region reserved
1089 * for each physical cluster contains enough entries to be indexed
1090 * by the maximum physical CPU ID (AFF0) within the cluster.
1091 */
1092 unsigned int cur_cluster_offset = 0;
1093 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1094 if (cluster_phys_to_logical[i] != -1) {
1095 cluster_offsets[i] = cur_cluster_offset;
1096 cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1097 }
1098 }
1099 assert(cur_cluster_offset <= MAX_CPUS);
1100 #else
1101 /*
1102 * For H10, there are really 2 physical clusters, but they are not separated
1103 * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering
1104 * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing,
1105 * treat H10 and earlier devices as though they contain a single cluster.
1106 */
1107 cluster_offsets[0] = 0;
1108 #endif
1109 assert(topology_info.boot_cpu != NULL);
1110 ml_read_chip_revision(&topology_info.chip_revision);
1111
1112 /*
1113 * Set TPIDR_EL0 to indicate the correct cpu number, as we may
1114 * not be booting from cpu 0. Userspace will consume the current
1115 * CPU number through this register. For non-boot cores, this is
1116 * done in start.s (start_cpu) using the cpu_number field of the
1117 * per-cpu data object.
1118 */
1119 uint64_t cpuid = topology_info.boot_cpu->cpu_id;
1120
1121 __builtin_arm_wsr64("TPIDR_EL0", cpuid & MACHDEP_TPIDR_CPUNUM_MASK);
1122 assert((cpuid & MACHDEP_TPIDR_CPUNUM_MASK) == cpuid);
1123 __builtin_arm_wsr64("TPIDRRO_EL0", 0);
1124 }
1125
1126 const ml_topology_info_t *
ml_get_topology_info(void)1127 ml_get_topology_info(void)
1128 {
1129 return &topology_info;
1130 }
1131
1132 void
ml_map_cpu_pio(void)1133 ml_map_cpu_pio(void)
1134 {
1135 unsigned int i;
1136
1137 for (i = 0; i < topology_info.num_cpus; i++) {
1138 ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1139 if (cpu->cpu_IMPL_pa) {
1140 cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1141 cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1142 }
1143 if (cpu->cpu_UTTDBG_pa) {
1144 cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1145 }
1146 }
1147
1148 for (i = 0; i < topology_info.num_clusters; i++) {
1149 ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1150 if (cluster->acc_IMPL_pa) {
1151 cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1152 }
1153 if (cluster->cpm_IMPL_pa) {
1154 cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1155 }
1156 }
1157 }
1158
1159 unsigned int
ml_get_cpu_count(void)1160 ml_get_cpu_count(void)
1161 {
1162 return topology_info.num_cpus;
1163 }
1164
1165 unsigned int
ml_get_cluster_count(void)1166 ml_get_cluster_count(void)
1167 {
1168 return topology_info.num_clusters;
1169 }
1170
1171 int
ml_get_boot_cpu_number(void)1172 ml_get_boot_cpu_number(void)
1173 {
1174 return topology_info.boot_cpu->cpu_id;
1175 }
1176
1177 cluster_type_t
ml_get_boot_cluster_type(void)1178 ml_get_boot_cluster_type(void)
1179 {
1180 return topology_info.boot_cluster->cluster_type;
1181 }
1182
1183 int
ml_get_cpu_number(uint32_t phys_id)1184 ml_get_cpu_number(uint32_t phys_id)
1185 {
1186 phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1187
1188 for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1189 if (topology_info.cpus[i].phys_id == phys_id) {
1190 return i;
1191 }
1192 }
1193
1194 return -1;
1195 }
1196
1197 int
ml_get_cluster_number(uint32_t phys_id)1198 ml_get_cluster_number(uint32_t phys_id)
1199 {
1200 int cpu_id = ml_get_cpu_number(phys_id);
1201 if (cpu_id < 0) {
1202 return -1;
1203 }
1204
1205 ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1206
1207 return cpu->cluster_id;
1208 }
1209
1210 unsigned int
ml_get_cpu_number_local(void)1211 ml_get_cpu_number_local(void)
1212 {
1213 uint64_t mpidr_el1_value = 0;
1214 unsigned cpu_id;
1215
1216 /* We identify the CPU based on the constant bits of MPIDR_EL1. */
1217 MRS(mpidr_el1_value, "MPIDR_EL1");
1218 cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
1219
1220 assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
1221
1222 return cpu_id;
1223 }
1224
1225 int
ml_get_cluster_number_local()1226 ml_get_cluster_number_local()
1227 {
1228 uint64_t mpidr_el1_value = 0;
1229 unsigned cluster_id;
1230
1231 /* We identify the cluster based on the constant bits of MPIDR_EL1. */
1232 MRS(mpidr_el1_value, "MPIDR_EL1");
1233 cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
1234
1235 assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
1236
1237 return cluster_id;
1238 }
1239
1240 int
ml_get_max_cpu_number(void)1241 ml_get_max_cpu_number(void)
1242 {
1243 return topology_info.max_cpu_id;
1244 }
1245
1246 int
ml_get_max_cluster_number(void)1247 ml_get_max_cluster_number(void)
1248 {
1249 return topology_info.max_cluster_id;
1250 }
1251
1252 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1253 ml_get_first_cpu_id(unsigned int cluster_id)
1254 {
1255 return topology_info.clusters[cluster_id].first_cpu_id;
1256 }
1257
1258 /*
1259 * Return the die id of a cluster.
1260 */
1261 unsigned int
ml_get_die_id(unsigned int cluster_id)1262 ml_get_die_id(unsigned int cluster_id)
1263 {
1264 /*
1265 * The current implementation gets the die_id from the
1266 * first CPU of the cluster.
1267 * rdar://80917654 (Add the die_id field to the cluster topology info)
1268 */
1269 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1270 return topology_info.cpus[first_cpu].die_id;
1271 }
1272
1273 /*
1274 * Return the index of a cluster in its die.
1275 */
1276 unsigned int
ml_get_die_cluster_id(unsigned int cluster_id)1277 ml_get_die_cluster_id(unsigned int cluster_id)
1278 {
1279 /*
1280 * The current implementation gets the die_id from the
1281 * first CPU of the cluster.
1282 * rdar://80917654 (Add the die_id field to the cluster topology info)
1283 */
1284 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1285 return topology_info.cpus[first_cpu].die_cluster_id;
1286 }
1287
1288 /*
1289 * Return the highest die id of the system.
1290 */
1291 unsigned int
ml_get_max_die_id(void)1292 ml_get_max_die_id(void)
1293 {
1294 return topology_info.max_die_id;
1295 }
1296
/*
 * Routine: ml_lockdown_init
 * Function: When KERNEL_INTEGRITY_KTRR or KERNEL_INTEGRITY_CTRR is defined,
 *           call rorgn_stash_range(); otherwise do nothing.
 *
 * The definition uses the prototype form "(void)" so that calls passing
 * arguments are diagnosed at compile time.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif
}
1304
1305 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1306 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1307 {
1308 if (!f) {
1309 return KERN_FAILURE;
1310 }
1311
1312 assert(lockdown_done);
1313 f(this); // XXX: f this whole function
1314
1315 return KERN_SUCCESS;
1316 }
1317
1318 static mcache_flush_function mcache_flush_func;
1319 static void* mcache_flush_service;
1320 kern_return_t
ml_mcache_flush_callback_register(mcache_flush_function func,void * service)1321 ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
1322 {
1323 mcache_flush_service = service;
1324 mcache_flush_func = func;
1325
1326 return KERN_SUCCESS;
1327 }
1328
1329 kern_return_t
ml_mcache_flush(void)1330 ml_mcache_flush(void)
1331 {
1332 if (!mcache_flush_func) {
1333 panic("Cannot flush M$ with no flush callback registered");
1334
1335 return KERN_FAILURE;
1336 } else {
1337 return mcache_flush_func(mcache_flush_service);
1338 }
1339 }
1340
1341
1342 extern lck_mtx_t pset_create_lock;
1343
1344 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1345 ml_processor_register(ml_processor_info_t *in_processor_info,
1346 processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1347 perfmon_interrupt_handler_func *pmi_handler_out)
1348 {
1349 cpu_data_t *this_cpu_datap;
1350 processor_set_t pset;
1351 boolean_t is_boot_cpu;
1352 static unsigned int reg_cpu_count = 0;
1353
1354 if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1355 return KERN_FAILURE;
1356 }
1357
1358 if ((unsigned)OSIncrementAtomic((SInt32*)®_cpu_count) >= topology_info.num_cpus) {
1359 return KERN_FAILURE;
1360 }
1361
1362 if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1363 is_boot_cpu = FALSE;
1364 this_cpu_datap = cpu_data_alloc(FALSE);
1365 cpu_data_init(this_cpu_datap);
1366 } else {
1367 this_cpu_datap = &BootCpuData;
1368 is_boot_cpu = TRUE;
1369 }
1370
1371 assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1372
1373 this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1374
1375 if (!is_boot_cpu) {
1376 this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1377
1378 if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1379 goto processor_register_error;
1380 }
1381 assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1382 }
1383
1384 this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1385 this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1386 nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1387 this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1388
1389 this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1390 this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1391
1392 this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1393 this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1394 this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1395 this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1396
1397 this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1398 this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1399 this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1400 this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1401 this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1402 this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1403
1404 #if HAS_CLUSTER
1405 this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1406 #else /* HAS_CLUSTER */
1407 this_cpu_datap->cluster_master = is_boot_cpu;
1408 #endif /* HAS_CLUSTER */
1409 lck_mtx_lock(&pset_create_lock);
1410 pset = pset_find(in_processor_info->cluster_id, NULL);
1411 kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1412 if (pset == NULL) {
1413 #if __AMP__
1414 pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1415 pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1416 assert(pset != PROCESSOR_SET_NULL);
1417 kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1418 #else /* __AMP__ */
1419 pset_cluster_type_t pset_cluster_type = PSET_SMP;
1420 pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1421 assert(pset != PROCESSOR_SET_NULL);
1422 #endif /* __AMP__ */
1423 }
1424 kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1425 lck_mtx_unlock(&pset_create_lock);
1426
1427 processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1428 if (!is_boot_cpu) {
1429 processor_init(processor, this_cpu_datap->cpu_number, pset);
1430
1431 if (this_cpu_datap->cpu_l2_access_penalty) {
1432 /*
1433 * Cores that have a non-zero L2 access penalty compared
1434 * to the boot processor should be de-prioritized by the
1435 * scheduler, so that threads use the cores with better L2
1436 * preferentially.
1437 */
1438 processor_set_primary(processor, master_processor);
1439 }
1440 }
1441
1442 *processor_out = processor;
1443 *ipi_handler_out = cpu_signal_handler;
1444 #if CPMU_AIC_PMI && MONOTONIC
1445 *pmi_handler_out = mt_cpmu_aic_pmi;
1446 #else
1447 *pmi_handler_out = NULL;
1448 #endif /* CPMU_AIC_PMI && MONOTONIC */
1449 if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1450 *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1451 }
1452
1453 #if KPC
1454 if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1455 goto processor_register_error;
1456 }
1457 #endif /* KPC */
1458
1459 if (!is_boot_cpu) {
1460 random_cpu_init(this_cpu_datap->cpu_number);
1461 // now let next CPU register itself
1462 OSIncrementAtomic((SInt32*)&real_ncpus);
1463 }
1464
1465 return KERN_SUCCESS;
1466
1467 processor_register_error:
1468 #if KPC
1469 kpc_unregister_cpu(this_cpu_datap);
1470 #endif /* KPC */
1471 if (!is_boot_cpu) {
1472 cpu_data_free(this_cpu_datap);
1473 }
1474
1475 return KERN_FAILURE;
1476 }
1477
1478 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1479 ml_init_arm_debug_interface(
1480 void * in_cpu_datap,
1481 vm_offset_t virt_address)
1482 {
1483 ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1484 do_debugid();
1485 }
1486
1487 /*
1488 * Routine: init_ast_check
1489 * Function:
1490 */
1491 void
init_ast_check(__unused processor_t processor)1492 init_ast_check(
1493 __unused processor_t processor)
1494 {
1495 }
1496
1497 /*
1498 * Routine: cause_ast_check
1499 * Function:
1500 */
1501 void
cause_ast_check(processor_t processor)1502 cause_ast_check(
1503 processor_t processor)
1504 {
1505 if (current_processor() != processor) {
1506 cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1507 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1508 }
1509 }
1510
1511 extern uint32_t cpu_idle_count;
1512
1513 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1514 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1515 {
1516 *icp = ml_at_interrupt_context();
1517 *pidlep = (cpu_idle_count == real_ncpus);
1518 }
1519
/*
 *	Routine:        ml_cause_interrupt
 *	Function:       Generate a fake interrupt.  Currently a no-op.
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX */
}
1529
1530 /* Map memory map IO space */
1531 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1532 ml_io_map(
1533 vm_offset_t phys_addr,
1534 vm_size_t size)
1535 {
1536 return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1537 }
1538
1539 /* Map memory map IO space (with protections specified) */
1540 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1541 ml_io_map_with_prot(
1542 vm_offset_t phys_addr,
1543 vm_size_t size,
1544 vm_prot_t prot)
1545 {
1546 return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1547 }
1548
1549 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1550 ml_io_map_unmappable(
1551 vm_offset_t phys_addr,
1552 vm_size_t size,
1553 unsigned int flags)
1554 {
1555 return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1556 }
1557
1558 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1559 ml_io_map_wcomb(
1560 vm_offset_t phys_addr,
1561 vm_size_t size)
1562 {
1563 return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1564 }
1565
1566 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1567 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1568 {
1569 pmap_remove(kernel_pmap, addr, addr + sz);
1570 kmem_free(kernel_map, addr, sz);
1571 }
1572
1573 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1574 ml_map_high_window(
1575 vm_offset_t phys_addr,
1576 vm_size_t len)
1577 {
1578 return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1579 }
1580
1581 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1582 ml_static_ptovirt(
1583 vm_offset_t paddr)
1584 {
1585 return phystokv(paddr);
1586 }
1587
1588 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1589 ml_static_slide(
1590 vm_offset_t vaddr)
1591 {
1592 vm_offset_t slid_vaddr = 0;
1593
1594 {
1595 slid_vaddr = vaddr + vm_kernel_slide;
1596 }
1597
1598 if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
1599 /* This is only intended for use on static kernel addresses. */
1600 return 0;
1601 }
1602
1603 return slid_vaddr;
1604 }
1605
1606 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1607 ml_static_unslide(
1608 vm_offset_t vaddr)
1609 {
1610 if (!VM_KERNEL_IS_SLID(vaddr)) {
1611 /* This is only intended for use on static kernel addresses. */
1612 return 0;
1613 }
1614
1615
1616 return vaddr - vm_kernel_slide;
1617 }
1618
extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);

/*
 * Routine: ml_static_protect
 * Function: Change the protections of the kernel mappings covering
 *           [vaddr, trunc_page_64(vaddr + size)) to match new_prot.
 *
 * Panics when vaddr is below VM_MIN_KERNEL_ADDRESS, when new_prot asks for
 * both write and execute, or when execute is requested after lockdown_done
 * is set.  vaddr is asserted to be page aligned.
 *
 * Pages for which pmap_find_phys() returns 0 are skipped.  A page covered
 * by a non-table TTE is accepted only if it is a block mapping that
 * already carries the requested protections; a PTE with the contiguous
 * hint set is accepted only under the same condition.  Any other case
 * stops the walk.
 *
 * Returns:  KERN_SUCCESS, or KERN_FAILURE if the walk stopped early.  The
 *           TLB is flushed for the portion that was walked either way.
 *
 * NOTE(review): new_prot is annotated __unused although it is read below.
 */
kern_return_t
ml_static_protect(
	vm_offset_t vaddr, /* kernel virtual address */
	vm_size_t size,
	vm_prot_t new_prot __unused)
{
	pt_entry_t    arm_prot = 0;
	pt_entry_t    arm_block_prot = 0;
	vm_offset_t   vaddr_cur;
	ppnum_t       ppn;
	kern_return_t result = KERN_SUCCESS;

	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) VM_MIN_KERNEL_ADDRESS);
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	/* The NX bit is set unconditionally; PNX is set unless execute was requested. */
	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	/* Walk the range one page at a time. */
	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		/* Pages without a physical translation are skipped. */
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t      *tte2;
			pt_entry_t      *pte_p;
			pt_entry_t      ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections.  We still want to run checks on a per-page basis.
					 */
					continue;
				}

				/* Neither a table entry nor a block with matching protections. */
				result = KERN_FAILURE;
				break;
			}

			/* Locate the L3 PTE for vaddr_cur inside the table that tte2 points to. */
			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				/* In this configuration four consecutive PTEs starting at pte_p are updated. */
				{
					unsigned int    i;
					pt_entry_t      *ptep_iter;

					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	/* Flush the TLB for whatever part of the range was walked; the length must fit in 32 bits. */
	if (vaddr_cur > vaddr) {
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}


	return result;
}
1745
1746
1747 /*
1748 * Routine: ml_static_mfree
1749 * Function:
1750 */
1751 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)1752 ml_static_mfree(
1753 vm_offset_t vaddr,
1754 vm_size_t size)
1755 {
1756 vm_offset_t vaddr_cur;
1757 vm_offset_t paddr_cur;
1758 ppnum_t ppn;
1759 uint32_t freed_pages = 0;
1760 uint32_t freed_kernelcache_pages = 0;
1761
1762
1763 /* It is acceptable (if bad) to fail to free. */
1764 if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1765 return;
1766 }
1767
1768 assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1769
1770 for (vaddr_cur = vaddr;
1771 vaddr_cur < trunc_page_64(vaddr + size);
1772 vaddr_cur += PAGE_SIZE) {
1773 ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1774 if (ppn != (vm_offset_t) NULL) {
1775 /*
1776 * It is not acceptable to fail to update the protections on a page
1777 * we will release to the VM. We need to either panic or continue.
1778 * For now, we'll panic (to help flag if there is memory we can
1779 * reclaim).
1780 */
1781 if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
1782 panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1783 }
1784
1785 paddr_cur = ptoa(ppn);
1786
1787
1788 vm_page_create(ppn, (ppn + 1));
1789 freed_pages++;
1790 if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end) {
1791 freed_kernelcache_pages++;
1792 }
1793 }
1794 }
1795 vm_page_lockspin_queues();
1796 vm_page_wire_count -= freed_pages;
1797 vm_page_wire_count_initial -= freed_pages;
1798 vm_page_kernelcache_count -= freed_kernelcache_pages;
1799 vm_page_unlock_queues();
1800 #if DEBUG
1801 kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
1802 #endif
1803 }
1804
1805 /*
1806 * Routine: ml_page_protection_type
1807 * Function: Returns the type of page protection that the system supports.
1808 */
1809 ml_page_protection_t
ml_page_protection_type(void)1810 ml_page_protection_type(void)
1811 {
1812 #if XNU_MONITOR
1813 return 1;
1814 #else
1815 return 0;
1816 #endif
1817 }
1818
1819 /* virtual to physical on wired pages */
1820 vm_offset_t
ml_vtophys(vm_offset_t vaddr)1821 ml_vtophys(vm_offset_t vaddr)
1822 {
1823 return kvtophys(vaddr);
1824 }
1825
1826 /*
1827 * Routine: ml_nofault_copy
1828 * Function: Perform a physical mode copy if the source and destination have
1829 * valid translations in the kernel pmap. If translations are present, they are
1830 * assumed to be wired; e.g., no attempt is made to guarantee that the
1831 * translations obtained remain valid for the duration of the copy process.
1832 */
1833 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)1834 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
1835 {
1836 addr64_t cur_phys_dst, cur_phys_src;
1837 vm_size_t count, nbytes = 0;
1838
1839 while (size > 0) {
1840 if (!(cur_phys_src = kvtophys(virtsrc))) {
1841 break;
1842 }
1843 if (!(cur_phys_dst = kvtophys(virtdst))) {
1844 break;
1845 }
1846 if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
1847 !pmap_valid_address(trunc_page_64(cur_phys_src))) {
1848 break;
1849 }
1850 count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
1851 if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
1852 count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
1853 }
1854 if (count > size) {
1855 count = size;
1856 }
1857
1858 bcopy_phys(cur_phys_src, cur_phys_dst, count);
1859
1860 nbytes += count;
1861 virtsrc += count;
1862 virtdst += count;
1863 size -= count;
1864 }
1865
1866 return nbytes;
1867 }
1868
1869 /*
1870 * Routine: ml_validate_nofault
1871 * Function: Validate that ths address range has a valid translations
1872 * in the kernel pmap. If translations are present, they are
1873 * assumed to be wired; i.e. no attempt is made to guarantee
1874 * that the translation persist after the check.
1875 * Returns: TRUE if the range is mapped and will not cause a fault,
1876 * FALSE otherwise.
1877 */
1878
1879 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)1880 ml_validate_nofault(
1881 vm_offset_t virtsrc, vm_size_t size)
1882 {
1883 addr64_t cur_phys_src;
1884 uint32_t count;
1885
1886 while (size > 0) {
1887 if (!(cur_phys_src = kvtophys(virtsrc))) {
1888 return FALSE;
1889 }
1890 if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
1891 return FALSE;
1892 }
1893 count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
1894 if (count > size) {
1895 count = (uint32_t)size;
1896 }
1897
1898 virtsrc += count;
1899 size -= count;
1900 }
1901
1902 return TRUE;
1903 }
1904
1905 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)1906 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
1907 {
1908 *phys_addr = 0;
1909 *size = 0;
1910 }
1911
1912 void
active_rt_threads(__unused boolean_t active)1913 active_rt_threads(__unused boolean_t active)
1914 {
1915 }
1916
1917 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)1918 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
1919 {
1920 return;
1921 }
1922
1923 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
1924
1925 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)1926 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
1927 {
1928 if (cpu_qos_cb != NULL) {
1929 cpu_qos_update = cpu_qos_cb;
1930 } else {
1931 cpu_qos_update = cpu_qos_cb_default;
1932 }
1933 }
1934
1935 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)1936 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
1937 {
1938 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
1939
1940 cpu_qos_update((int)urgency, rt_period, rt_deadline);
1941
1942 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
1943 }
1944
1945 void
machine_run_count(__unused uint32_t count)1946 machine_run_count(__unused uint32_t count)
1947 {
1948 }
1949
1950 processor_t
machine_choose_processor(__unused processor_set_t pset,processor_t processor)1951 machine_choose_processor(__unused processor_set_t pset, processor_t processor)
1952 {
1953 return processor;
1954 }
1955
1956 #if KASAN
1957 vm_offset_t ml_stack_base(void);
1958 vm_size_t ml_stack_size(void);
1959
1960 vm_offset_t
ml_stack_base(void)1961 ml_stack_base(void)
1962 {
1963 uintptr_t local = (uintptr_t) &local;
1964 vm_offset_t intstack_top_ptr;
1965
1966 intstack_top_ptr = getCpuDatap()->intstack_top;
1967 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1968 return intstack_top_ptr - INTSTACK_SIZE;
1969 } else {
1970 return current_thread()->kernel_stack;
1971 }
1972 }
1973 vm_size_t
ml_stack_size(void)1974 ml_stack_size(void)
1975 {
1976 uintptr_t local = (uintptr_t) &local;
1977 vm_offset_t intstack_top_ptr;
1978
1979 intstack_top_ptr = getCpuDatap()->intstack_top;
1980 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1981 return INTSTACK_SIZE;
1982 } else {
1983 return kernel_stack_size;
1984 }
1985 }
1986 #endif
1987
1988 #ifdef CONFIG_KCOV
1989
1990 kcov_cpu_data_t *
current_kcov_data(void)1991 current_kcov_data(void)
1992 {
1993 return ¤t_cpu_datap()->cpu_kcov_data;
1994 }
1995
1996 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)1997 cpu_kcov_data(int cpuid)
1998 {
1999 return &cpu_datap(cpuid)->cpu_kcov_data;
2000 }
2001
2002 #endif /* CONFIG_KCOV */
2003
2004 boolean_t
machine_timeout_suspended(void)2005 machine_timeout_suspended(void)
2006 {
2007 return FALSE;
2008 }
2009
2010 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)2011 ml_interrupt_prewarm(__unused uint64_t deadline)
2012 {
2013 return KERN_FAILURE;
2014 }
2015
2016 /*
2017 * Assumes fiq, irq disabled.
2018 */
2019 void
ml_set_decrementer(uint32_t dec_value)2020 ml_set_decrementer(uint32_t dec_value)
2021 {
2022 cpu_data_t *cdp = getCpuDatap();
2023
2024 assert(ml_get_interrupts_enabled() == FALSE);
2025 cdp->cpu_decrementer = dec_value;
2026
2027 if (cdp->cpu_set_decrementer_func) {
2028 cdp->cpu_set_decrementer_func(dec_value);
2029 } else {
2030 __builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
2031 }
2032 }
2033
2034 /**
2035 * Reads from a non-speculative view of the timebase. If no such view exists on
2036 * this CPU, then an ISB is used to prevent speculation instead.
2037 *
2038 * @return the current value of the hardware timebase
2039 */
2040 static inline uint64_t
nonspeculative_timebase(void)2041 nonspeculative_timebase(void)
2042 {
2043 #if defined(HAS_ACNTVCT)
2044 return __builtin_arm_rsr64("ACNTVCT_EL0");
2045 #elif __ARM_ARCH_8_6__
2046 return __builtin_arm_rsr64("CNTVCTSS_EL0");
2047 #else
2048 // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
2049 // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
2050 // to other instructions executed on the same processor."
2051 __builtin_arm_isb(ISB_SY);
2052 return __builtin_arm_rsr64("CNTVCT_EL0");
2053 #endif
2054 }
2055
2056
/**
 * Returns the raw hardware timebase, read through the non-speculative view.
 *
 * Defined with an explicit (void) parameter list so the definition provides
 * a prototype, like the other functions in this file.
 */
uint64_t
ml_get_hwclock(void)
{
	return nonspeculative_timebase();
}
2063
2064 uint64_t
ml_get_timebase()2065 ml_get_timebase()
2066 {
2067 uint64_t clock, timebase;
2068
2069 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
2070 do {
2071 timebase = getCpuDatap()->cpu_base_timebase;
2072 os_compiler_barrier();
2073 clock = ml_get_hwclock();
2074 os_compiler_barrier();
2075 } while (getCpuDatap()->cpu_base_timebase != timebase);
2076
2077 return clock + timebase;
2078 }
2079
2080 /**
2081 * Issue a barrier that guarantees all prior memory accesses will complete
2082 * before any subsequent timebase reads.
2083 */
2084 void
ml_memory_to_timebase_fence(void)2085 ml_memory_to_timebase_fence(void)
2086 {
2087 __builtin_arm_dmb(DMB_SY);
2088 const uint64_t take_backwards_branch = 0;
2089 asm volatile (
2090 "1:"
2091 "ldr x0, [%[take_backwards_branch]]" "\n"
2092 "cbnz x0, 1b" "\n"
2093 :
2094 : [take_backwards_branch] "r"(&take_backwards_branch)
2095 : "x0"
2096 );
2097
2098 /* throwaway read to prevent ml_get_speculative_timebase() reordering */
2099 (void)ml_get_hwclock();
2100 }
2101
2102 /**
2103 * Issue a barrier that guarantees all prior timebase reads will
2104 * be ordered before any subsequent memory accesses.
2105 */
2106 void
ml_timebase_to_memory_fence(void)2107 ml_timebase_to_memory_fence(void)
2108 {
2109 __builtin_arm_isb(ISB_SY);
2110 }
2111
2112 /*
2113 * Get the speculative timebase without an ISB.
2114 */
2115 uint64_t
ml_get_speculative_timebase(void)2116 ml_get_speculative_timebase(void)
2117 {
2118 uint64_t clock, timebase;
2119
2120 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
2121 do {
2122 timebase = getCpuDatap()->cpu_base_timebase;
2123 os_compiler_barrier();
2124 clock = __builtin_arm_rsr64("CNTVCT_EL0");
2125
2126 os_compiler_barrier();
2127 } while (getCpuDatap()->cpu_base_timebase != timebase);
2128
2129 return clock + timebase;
2130 }
2131
/**
 * Returns a timebase sample for use as entropy; this is the speculative
 * timebase value.
 */
uint64_t
ml_get_timebase_entropy(void)
{
	const uint64_t sample = ml_get_speculative_timebase();

	return sample;
}
2137
2138 uint32_t
ml_get_decrementer(void)2139 ml_get_decrementer(void)
2140 {
2141 cpu_data_t *cdp = getCpuDatap();
2142 uint32_t dec;
2143
2144 assert(ml_get_interrupts_enabled() == FALSE);
2145
2146 if (cdp->cpu_get_decrementer_func) {
2147 dec = cdp->cpu_get_decrementer_func();
2148 } else {
2149 uint64_t wide_val;
2150
2151 wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2152 dec = (uint32_t)wide_val;
2153 assert(wide_val == (uint64_t)dec);
2154 }
2155
2156 return dec;
2157 }
2158
2159 boolean_t
ml_get_timer_pending(void)2160 ml_get_timer_pending(void)
2161 {
2162 uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2163 return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2164 }
2165
2166 __attribute__((noreturn))
2167 void
platform_syscall(arm_saved_state_t * state)2168 platform_syscall(arm_saved_state_t *state)
2169 {
2170 uint32_t code;
2171
2172 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2173
2174 code = (uint32_t)get_saved_state_reg(state, 3);
2175
2176 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2177 get_saved_state_reg(state, 0),
2178 get_saved_state_reg(state, 1),
2179 get_saved_state_reg(state, 2));
2180
2181 switch (code) {
2182 case 2:
2183 /* set cthread */
2184 platform_syscall_kprintf("set cthread self.\n");
2185 thread_set_cthread_self(get_saved_state_reg(state, 0));
2186 break;
2187 case 3:
2188 /* get cthread */
2189 platform_syscall_kprintf("get cthread self.\n");
2190 set_saved_state_reg(state, 0, thread_get_cthread_self());
2191 break;
2192 case 0: /* I-Cache flush (removed) */
2193 case 1: /* D-Cache flush (removed) */
2194 default:
2195 platform_syscall_kprintf("unknown: %d\n", code);
2196 break;
2197 }
2198
2199 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2200 get_saved_state_reg(state, 0));
2201
2202 thread_exception_return();
2203 }
2204
2205 static void
_enable_timebase_event_stream(uint32_t bit_index)2206 _enable_timebase_event_stream(uint32_t bit_index)
2207 {
2208 uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */
2209
2210 if (bit_index >= 64) {
2211 panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2212 }
2213
2214 __asm__ volatile ("mrs %0, CNTKCTL_EL1" : "=r"(cntkctl));
2215
2216 cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2217 cntkctl |= CNTKCTL_EL1_EVNTEN;
2218 cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2219
2220 /*
2221 * If the SOC supports it (and it isn't broken), enable
2222 * EL0 access to the timebase registers.
2223 */
2224 if (user_timebase_type() != USER_TIMEBASE_NONE) {
2225 cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2226 }
2227
2228 __builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
2229 }
2230
2231 /*
2232 * Turn timer on, unmask that interrupt.
2233 */
2234 static void
_enable_virtual_timer(void)2235 _enable_virtual_timer(void)
2236 {
2237 uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2238
2239 __builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
2240 /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2241 __builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2242 }
2243
2244 void
fiq_context_init(boolean_t enable_fiq __unused)2245 fiq_context_init(boolean_t enable_fiq __unused)
2246 {
2247 /* Interrupts still disabled. */
2248 assert(ml_get_interrupts_enabled() == FALSE);
2249 _enable_virtual_timer();
2250 }
2251
2252 void
wfe_timeout_init(void)2253 wfe_timeout_init(void)
2254 {
2255 _enable_timebase_event_stream(arm64_eventi);
2256 }
2257
2258 /**
2259 * Configures, but does not enable, the WFE event stream. The event stream
2260 * generates an event at a set interval to act as a timeout for WFEs.
2261 *
2262 * This function sets the static global variable arm64_eventi to be the proper
2263 * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2264 * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2265 * is used by wfe_timeout_init to actually poke the registers and enable the
2266 * event stream.
2267 *
2268 * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2269 * is the trigger for the system to generate an event. The trigger can occur on
2270 * either the rising or falling edge of the bit depending on the value of
2271 * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2272 * falling edge (1->0) transition to generate events.
2273 */
2274 void
wfe_timeout_configure(void)2275 wfe_timeout_configure(void)
2276 {
2277 /* Could fill in our own ops here, if we needed them */
2278 uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0;
2279 uint32_t bit_index;
2280
2281 if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2282 if (events_per_sec <= 0) {
2283 events_per_sec = 1;
2284 } else if (events_per_sec > USEC_PER_SEC) {
2285 events_per_sec = USEC_PER_SEC;
2286 }
2287 } else {
2288 events_per_sec = USEC_PER_SEC;
2289 }
2290 ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2291 ticks_per_event = ticks_per_sec / events_per_sec;
2292
2293 /* Bit index of next power of two greater than ticks_per_event */
2294 bit_index = flsll(ticks_per_event) - 1;
2295 /* Round up to next power of two if ticks_per_event is initially power of two */
2296 if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2297 bit_index++;
2298 }
2299
2300 /*
2301 * The timer can only trigger on rising or falling edge, not both; we don't
2302 * care which we trigger on, but we do need to adjust which bit we are
2303 * interested in to account for this.
2304 *
2305 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2306 * falling edge of the given bit. Therefore, we must decrement the bit index
2307 * by one as when the bit before the one we care about makes a 1 -> 0
2308 * transition, the bit we care about makes a 0 -> 1 transition.
2309 *
2310 * For example if we want an event generated every 8 ticks (if we calculated
2311 * a bit_index of 3), we would want the event to be generated whenever the
2312 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2313 * see that the bit at index 2 makes a falling transition in this scenario,
2314 * so we would want EVENTI to be 2 instead of 3.
2315 */
2316 if (bit_index != 0) {
2317 bit_index--;
2318 }
2319
2320 arm64_eventi = bit_index;
2321 }
2322
2323 boolean_t
ml_delay_should_spin(uint64_t interval)2324 ml_delay_should_spin(uint64_t interval)
2325 {
2326 cpu_data_t *cdp = getCpuDatap();
2327
2328 if (cdp->cpu_idle_latency) {
2329 return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2330 } else {
2331 /*
2332 * Early boot, latency is unknown. Err on the side of blocking,
2333 * which should always be safe, even if slow
2334 */
2335 return FALSE;
2336 }
2337 }
2338
2339 boolean_t
ml_thread_is64bit(thread_t thread)2340 ml_thread_is64bit(thread_t thread)
2341 {
2342 return thread_is_64bit_addr(thread);
2343 }
2344
/**
 * On DEVELOPMENT/DEBUG kernels, delays for yield_delay_us when that value
 * is non-zero. Does nothing on other kernels.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us != 0) {
		delay(yield_delay_us);
	}
#endif
	return;
}
2354
/**
 * Timer evaluation hook; this implementation does nothing.
 */
void
ml_timer_evaluate(void)
{
	/* Nothing to do here. */
	return;
}
2359
2360 boolean_t
ml_timer_forced_evaluation(void)2361 ml_timer_forced_evaluation(void)
2362 {
2363 return FALSE;
2364 }
2365
2366 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2367 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2368 {
2369 /*
2370 * For now: update the resource coalition stats of the
2371 * current thread's coalition
2372 */
2373 task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2374 }
2375
2376 uint64_t
ml_gpu_stat(__unused thread_t t)2377 ml_gpu_stat(__unused thread_t t)
2378 {
2379 return 0;
2380 }
2381
2382 thread_t
current_thread(void)2383 current_thread(void)
2384 {
2385 return current_thread_fast();
2386 }
2387
/* Registration record for one exception callback class. */
typedef struct{
	ex_cb_t         cb;     /* registered callback; NULL while the slot is free */
	void            *refcon; /* caller-supplied context passed back to cb on invocation */
}
ex_cb_info_t;

/* One registration slot per callback class, indexed by ex_cb_class_t. */
ex_cb_info_t ex_cb_info[EXCB_CLASS_MAX];
2395
2396 /*
2397 * Callback registration
2398 * Currently we support only one registered callback per class but
2399 * it should be possible to support more callbacks
2400 */
2401 kern_return_t
ex_cb_register(ex_cb_class_t cb_class,ex_cb_t cb,void * refcon)2402 ex_cb_register(
2403 ex_cb_class_t cb_class,
2404 ex_cb_t cb,
2405 void *refcon)
2406 {
2407 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2408
2409 if ((NULL == cb) || (cb_class >= EXCB_CLASS_MAX)) {
2410 return KERN_INVALID_VALUE;
2411 }
2412
2413 if (NULL == pInfo->cb) {
2414 pInfo->cb = cb;
2415 pInfo->refcon = refcon;
2416 return KERN_SUCCESS;
2417 }
2418 return KERN_FAILURE;
2419 }
2420
2421 /*
2422 * Called internally by platform kernel to invoke the registered callback for class
2423 */
2424 ex_cb_action_t
ex_cb_invoke(ex_cb_class_t cb_class,vm_offset_t far)2425 ex_cb_invoke(
2426 ex_cb_class_t cb_class,
2427 vm_offset_t far)
2428 {
2429 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2430 ex_cb_state_t state = {far};
2431
2432 if (cb_class >= EXCB_CLASS_MAX) {
2433 panic("Invalid exception callback class 0x%x", cb_class);
2434 }
2435
2436 if (pInfo->cb) {
2437 return pInfo->cb(cb_class, pInfo->refcon, &state);
2438 }
2439 return EXCB_ACTION_NONE;
2440 }
2441
2442 #if defined(HAS_APPLE_PAC)
2443 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2444 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2445 {
2446 assert(task);
2447 task->disable_user_jop = disable_user_jop;
2448 }
2449
2450 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2451 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2452 {
2453 assert(thread);
2454 if (disable_user_jop) {
2455 thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2456 } else {
2457 thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2458 }
2459 }
2460
2461 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2462 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2463 {
2464 if (inherit) {
2465 task->rop_pid = parent_task->rop_pid;
2466 } else {
2467 task->rop_pid = early_random();
2468 }
2469 }
2470
2471 /**
2472 * jop_pid may be inherited from the parent task or generated inside the shared
2473 * region. Unfortunately these two parameters are available at very different
2474 * times during task creation, so we need to split this into two steps.
2475 */
2476 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit)2477 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
2478 {
2479 if (inherit) {
2480 task->jop_pid = parent_task->jop_pid;
2481 } else {
2482 task->jop_pid = ml_default_jop_pid();
2483 }
2484 }
2485
2486 void
ml_task_set_jop_pid_from_shared_region(task_t task)2487 ml_task_set_jop_pid_from_shared_region(task_t task)
2488 {
2489 vm_shared_region_t sr = vm_shared_region_get(task);
2490 /*
2491 * If there's no shared region, we can assign the key arbitrarily. This
2492 * typically happens when Mach-O image activation failed part of the way
2493 * through, and this task is in the middle of dying with SIGKILL anyway.
2494 */
2495 if (__improbable(!sr)) {
2496 task->jop_pid = early_random();
2497 return;
2498 }
2499 vm_shared_region_deallocate(sr);
2500
2501 /*
2502 * Similarly we have to worry about jetsam having killed the task and
2503 * already cleared the shared_region_id.
2504 */
2505 task_lock(task);
2506 if (task->shared_region_id != NULL) {
2507 task->jop_pid = shared_region_find_key(task->shared_region_id);
2508 } else {
2509 task->jop_pid = early_random();
2510 }
2511 task_unlock(task);
2512 }
2513
2514 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2515 ml_thread_set_jop_pid(thread_t thread, task_t task)
2516 {
2517 thread->machine.jop_pid = task->jop_pid;
2518 }
2519 #endif /* defined(HAS_APPLE_PAC) */
2520
2521 #if DEVELOPMENT || DEBUG
2522 static uint64_t minor_badness_suffered = 0;
2523 #endif
2524 void
ml_report_minor_badness(uint32_t __unused badness_id)2525 ml_report_minor_badness(uint32_t __unused badness_id)
2526 {
2527 #if DEVELOPMENT || DEBUG
2528 (void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
2529 #endif
2530 }
2531
2532 #if defined(HAS_APPLE_PAC)
2533 #if __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM
/**
 * The ARMv8.6 implementation is also safe for non-FPAC CPUs, but less efficient;
 * guest kernels need to use it because they do not know at compile time whether
 * the host CPU supports FPAC.
 */
2539
2540 /**
2541 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2542 */
2543 static void *
ml_poison_ptr(void * ptr,ptrauth_key key)2544 ml_poison_ptr(void *ptr, ptrauth_key key)
2545 {
2546 bool b_key = key & (1ULL << 0);
2547 uint64_t error_code;
2548 if (b_key) {
2549 error_code = 2;
2550 } else {
2551 error_code = 1;
2552 }
2553
2554 bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2555 bool data_key = key & (1ULL << 1);
2556 /* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2557 bool tbi = data_key && !kernel_pointer;
2558 unsigned int poison_shift;
2559 if (tbi) {
2560 poison_shift = 53;
2561 } else {
2562 poison_shift = 61;
2563 }
2564
2565 uintptr_t poisoned = (uintptr_t)ptr;
2566 poisoned &= ~(3ULL << poison_shift);
2567 poisoned |= error_code << poison_shift;
2568 return (void *)poisoned;
2569 }
2570
/*
 * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
 * compiler to assume this operation has side-effects and cannot be reordered
 *
 * __suffix is pasted onto "pac" to form the instruction mnemonic; the signed
 * pointer is the value of the statement expression.
 */
#define ptrauth_sign_volatile(__value, __suffix, __data)                \
	({                                                              \
	        void *__signed_ptr = __value;                           \
	        asm volatile (                                          \
	                "pac" #__suffix "	%[value], %[data]"          \
	                : [value] "+r"(__signed_ptr)                    \
	                : [data] "r"(__data)                            \
	        );                                                      \
	        __signed_ptr;                                           \
	})
2585
/*
 * Authenticates _ptr in place without an aut* instruction: the pointer is
 * stripped and re-signed with the same key and modifier. If the re-signed
 * value matches the original, _ptr becomes the stripped pointer; otherwise
 * it becomes the poisoned form of the stripped pointer.
 */
#define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier)                           \
	do {                                                                                    \
	        void *_stripped_ptr = ptrauth_strip(_ptr, _key);                                \
	        void *_resigned_ptr = ptrauth_sign_volatile(_stripped_ptr, _suffix, _modifier); \
	        if (__probable(_ptr == _resigned_ptr)) {                                        \
	                _ptr = _stripped_ptr;                                                   \
	        } else {                                                                        \
	                _ptr = ml_poison_ptr(_stripped_ptr, _key);                              \
	        }                                                                               \
	} while (0)
2596
/* Emulated variant: derives the key constant from the suffix (ptrauth_key_as<suffix>). */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier)
2599 #else
/* Native variant: authenticates _ptr in place with the aut<suffix> instruction. */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
2602 #endif /* __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM */
2603
2604 /**
2605 * Authenticates a signed pointer without trapping on failure.
2606 *
2607 * @warning This function must be called with interrupts disabled.
2608 *
2609 * @warning Pointer authentication failure should normally be treated as a fatal
2610 * error. This function is intended for a handful of callers that cannot panic
2611 * on failure, and that understand the risks in handling a poisoned return
2612 * value. Other code should generally use the trapping variant
2613 * ptrauth_auth_data() instead.
2614 *
2615 * @param ptr the pointer to authenticate
2616 * @param key which key to use for authentication
2617 * @param modifier a modifier to mix into the key
2618 * @return an authenticated version of ptr, possibly with poison bits set
2619 */
2620 void *
ml_auth_ptr_unchecked(void * ptr,ptrauth_key key,uint64_t modifier)2621 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
2622 {
2623 switch (key & 0x3) {
2624 case ptrauth_key_asia:
2625 _ml_auth_ptr_unchecked(ptr, ia, modifier);
2626 break;
2627 case ptrauth_key_asib:
2628 _ml_auth_ptr_unchecked(ptr, ib, modifier);
2629 break;
2630 case ptrauth_key_asda:
2631 _ml_auth_ptr_unchecked(ptr, da, modifier);
2632 break;
2633 case ptrauth_key_asdb:
2634 _ml_auth_ptr_unchecked(ptr, db, modifier);
2635 break;
2636 }
2637
2638 return ptr;
2639 }
2640 #endif /* defined(HAS_APPLE_PAC) */
2641
2642 #ifdef CONFIG_XNUPOST
2643 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2644 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2645 {
2646 thread_t thread = current_thread();
2647 thread->machine.expected_fault_handler = expected_fault_handler;
2648 thread->machine.expected_fault_addr = expected_fault_addr;
2649 }
2650
2651 void
ml_expect_fault_end(void)2652 ml_expect_fault_end(void)
2653 {
2654 thread_t thread = current_thread();
2655 thread->machine.expected_fault_handler = NULL;
2656 thread->machine.expected_fault_addr = 0;
2657 }
2658 #endif /* CONFIG_XNUPOST */
2659
/**
 * On HIBERNATION kernels, rebuilds the VM structures when the system is
 * waking from hibernation. Does nothing otherwise.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_rebuild_vm_structs();
#endif /* HIBERNATION */
}
2670
/**
 * On HIBERNATION kernels, finishes a wake from hibernation: runs
 * hibernate_machine_init(), ends the hibernate VM lock, and clears the
 * current CPU's cpu_hibernate field. Does nothing otherwise.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2682
2683 /**
2684 * Return back a machine-dependent array of address space regions that should be
2685 * reserved by the VM (pre-mapped in the address space). This will prevent user
2686 * processes from allocating or deallocating from within these regions.
2687 *
2688 * @param vm_is64bit True if the process has a 64-bit address space.
2689 * @param regions An out parameter representing an array of regions to reserve.
2690 *
2691 * @return The number of reserved regions returned through `regions`.
2692 */
2693 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,const struct vm_reserved_region ** regions)2694 ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
2695 {
2696 assert(regions != NULL);
2697
2698 /**
2699 * Reserved regions only apply to 64-bit address spaces. This is because
2700 * we only expect to grow the maximum user VA address on 64-bit address spaces
2701 * (we've essentially already reached the max for 32-bit spaces). The reserved
2702 * regions should safely fall outside of the max user VA for 32-bit processes.
2703 */
2704 if (vm_is64bit) {
2705 *regions = vm_reserved_regions;
2706 return ARRAY_COUNT(vm_reserved_regions);
2707 } else {
2708 /* Don't reserve any VA regions on arm64_32 processes. */
2709 *regions = NULL;
2710 return 0;
2711 }
2712 }
2713
/* These WFE recommendations are expected to be updated on a relatively
 * infrequent cadence, possibly from a different cluster, hence
 * false cacheline sharing isn't expected to be material
 *
 * One entry per cluster, indexed by cluster ID; each entry holds the most
 * recently recommended WFE timeout interval for that cluster.
 */
static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
2719
2720 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)2721 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
2722 {
2723 assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
2724 assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
2725 os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
2726 return 0; /* Success */
2727 }
2728
#if DEVELOPMENT || DEBUG
/* Debug knobs consulted by ml_cluster_wfe_timeout() on DEVELOPMENT/DEBUG kernels. */
int wfe_rec_max = 0;                    /* non-zero: raise the timeout to the largest per-cluster recommendation */
int wfe_rec_none = 0;                   /* non-zero: force the timeout to 0 */
uint64_t wfe_rec_override_mat = 0;      /* non-zero: replaces the computed timeout */
uint64_t wfe_rec_clamp = 0;             /* non-zero: upper bound applied to the cluster's own recommendation */
#endif
2735
2736 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2737 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2738 {
2739 /* This and its consumer does not synchronize vis-a-vis updates
2740 * of the recommendation; races are acceptable.
2741 */
2742 uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2743 #if DEVELOPMENT || DEBUG
2744 if (wfe_rec_clamp) {
2745 wfet = MIN(wfe_rec_clamp, wfet);
2746 }
2747
2748 if (wfe_rec_max) {
2749 for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2750 if (arm64_cluster_wfe_recs[i] > wfet) {
2751 wfet = arm64_cluster_wfe_recs[i];
2752 }
2753 }
2754 }
2755
2756 if (wfe_rec_none) {
2757 wfet = 0;
2758 }
2759
2760 if (wfe_rec_override_mat) {
2761 wfet = wfe_rec_override_mat;
2762 }
2763 #endif
2764 return wfet;
2765 }
2766
2767 __pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)2768 ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
2769 {
2770 #if XNU_MONITOR
2771 return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
2772 #else
2773 return false;
2774 #endif /* XNU_MONITOR */
2775 }
2776
/**
 * Returns the program counter recorded in a saved state.
 *
 * @param state saved state to read; asserted to be non-NULL and 64-bit
 * @return the saved PC
 */
uint64_t
ml_get_backtrace_pc(struct arm_saved_state *state)
{
	assert((state != NULL) && is_saved_state64(state));

	const uint64_t saved_pc = get_saved_state_pc(state);

	return saved_pc;
}
2785