1 /*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/io_map_entries.h>
35 #include <arm/cpu_data.h>
36 #include <arm/cpu_data_internal.h>
37 #include <arm/caches_internal.h>
38 #include <arm/misc_protos.h>
39 #include <arm/machdep_call.h>
40 #include <arm/machine_routines.h>
41 #include <arm/rtclock.h>
42 #include <arm/cpuid_internal.h>
43 #include <arm/cpu_capabilities.h>
44 #include <console/serial_protos.h>
45 #include <kern/machine.h>
46 #include <kern/misc_protos.h>
47 #include <prng/random.h>
48 #include <kern/startup.h>
49 #include <kern/thread.h>
50 #include <kern/timer_queue.h>
51 #include <mach/machine.h>
52 #include <machine/atomic.h>
53 #include <machine/config.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_shared_region.h>
57 #include <vm/vm_map.h>
58 #include <sys/codesign.h>
59 #include <sys/kdebug.h>
60 #include <kern/coalition.h>
61 #include <pexpert/device_tree.h>
62
63 #include <IOKit/IOPlatformExpert.h>
64 #if HIBERNATION
65 #include <IOKit/IOHibernatePrivate.h>
66 #endif /* HIBERNATION */
67
68 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
69 #include <arm64/amcc_rorgn.h>
70 #endif
71
72
73 #include <libkern/section_keywords.h>
74
75 /**
76 * On supported hardware, debuggable builds make the HID bits read-only
77 * without locking them. This lets people manually modify HID bits while
78 * debugging, since they can use a debugging tool to first reset the HID
79 * bits back to read/write. However it will still catch xnu changes that
80 * accidentally write to HID bits after they've been made read-only.
81 */
82
83 #if KPC
84 #include <kern/kpc.h>
85 #endif
86
/*
 * Field extractors for an MPIDR_EL1 value: the AFF0 field is treated as the
 * CPU number and the AFF1 field as the cluster number.
 */
#define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
#define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)

#if HAS_CLUSTER
/* Only present on HAS_CLUSTER builds; starts out as 0. */
static uint8_t cluster_initialized = 0;
#endif
93
94 MACHINE_TIMEOUT32_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
95 machine_timeout32_t LockTimeOutUsec; // computed in ml_init_lock_timeout
96
97 MACHINE_TIMEOUT_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
98
99 MACHINE_TIMEOUT32_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
100
101 uint64_t low_MutexSpin;
102 int64_t high_MutexSpin;
103
104
105
106 static uint64_t ml_wfe_hint_max_interval;
107 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
108
109 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
110 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
111
112 extern vm_offset_t segLOWEST;
113 extern vm_offset_t segLOWESTTEXT;
114 extern vm_offset_t segLASTB;
115 extern unsigned long segSizeLAST;
116
117 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
118 extern vm_offset_t vm_kernelcache_base;
119 extern vm_offset_t vm_kernelcache_top;
120
121 #if defined(HAS_IPI)
122 unsigned int gFastIPI = 1;
123 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
124 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
125 kDeferredIPITimerDefault);
126 #endif /* defined(HAS_IPI) */
127
128 thread_t Idle_context(void);
129
130 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
131
132 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
133 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
134 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
135 .version = CPU_TOPOLOGY_VERSION,
136 .cpus = topology_cpu_array,
137 .clusters = topology_cluster_array,
138 };
139
140 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
141
142 /**
143 * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
144 * entries of an arbitrary data type. This is intended for use by specialized consumers
145 * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
146 * as follows:
147 * hypothetical_array[cluster_offsets[AFF1] + AFF0]
148 * Most consumers should instead use general-purpose facilities such as PERCPU or
149 * ml_get_cpu_number().
150 */
151 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
152
153 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
154
155 extern uint32_t lockdown_done;
156
157 /**
158 * Represents regions of virtual address space that should be reserved
159 * (pre-mapped) in each user address space.
160 */
161 SECURITY_READ_ONLY_LATE(static struct vm_reserved_region) vm_reserved_regions[] = {
162 {
163 .vmrr_name = "GPU Carveout",
164 .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
165 .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
166 },
167 /*
168 * Reserve the virtual memory space representing the commpage nesting region
169 * to prevent user processes from allocating memory within it. The actual
170 * page table entries for the commpage are inserted by vm_commpage_enter().
171 * This vm_map_enter() just prevents userspace from allocating/deallocating
172 * anything within the entire commpage nested region.
173 */
174 {
175 .vmrr_name = "commpage nesting",
176 .vmrr_addr = _COMM_PAGE64_NESTING_START,
177 .vmrr_size = _COMM_PAGE64_NESTING_SIZE
178 }
179 };
180
181 uint32_t get_arm_cpu_version(void);
182
#if defined(HAS_IPI)
/*
 * Issue an IPI request of the given type to the CPU identified by cpu_mpidr.
 *
 * The request value is the type OR-ed with the target's CPU ID; when the
 * target is in a different cluster, the target cluster ID is included as well.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
#if HAS_CLUSTER
	uint64_t self_mpidr;
	uint64_t request;

	/* NOTE: this logic expects that we are called in a non-preemptible
	 * context, or at least one in which the calling thread is bound
	 * to a single CPU.  Otherwise we may migrate between choosing which
	 * IPI mechanism to use and issuing the IPI. */
	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Target is in another cluster: encode the cluster ID in the request. */
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
		request = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", request);
	} else {
		/* Target shares our cluster: only the CPU ID is needed. */
		request = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", request);
	}
#else
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", request);
#endif
}
#endif
208
209 #if !defined(HAS_IPI)
210 __dead2
211 #endif
212 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)213 ml_cpu_signal(unsigned int cpu_mpidr __unused)
214 {
215 #if defined(HAS_IPI)
216 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
217 #else
218 panic("Platform does not support ACC Fast IPI");
219 #endif
220 }
221
222 #if !defined(HAS_IPI)
223 __dead2
224 #endif
225 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)226 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
227 {
228 #if defined(HAS_IPI)
229 /* adjust IPI_CR timer countdown value for deferred IPI
230 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
231 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
232 *
233 * global register, should only require a single write to update all
234 * CPU cores: from Skye ACC user spec section 5.7.3.3
235 *
236 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
237 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
238 */
239 uint64_t abstime;
240
241 nanoseconds_to_absolutetime(nanosecs, &abstime);
242
243 abstime = MIN(abstime, 0xFFFF);
244
245 /* update deferred_ipi_timer_ns with the new clamped value */
246 absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
247
248 MSR("S3_5_C15_C3_1", abstime);
249 #else
250 (void)nanosecs;
251 panic("Platform does not support ACC Fast IPI");
252 #endif
253 }
254
/*
 * Return the current deferred-IPI timer value in nanoseconds, or 0 on
 * platforms built without HAS_IPI.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype, matching the other functions in this file.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
264
265 #if !defined(HAS_IPI)
266 __dead2
267 #endif
268 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)269 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
270 {
271 #if defined(HAS_IPI)
272 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
273 #else
274 panic("Platform does not support ACC Fast IPI deferral");
275 #endif
276 }
277
278 #if !defined(HAS_IPI)
279 __dead2
280 #endif
281 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)282 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
283 {
284 #if defined(HAS_IPI)
285 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
286 #else
287 panic("Platform does not support ACC Fast IPI retraction");
288 #endif
289 }
290
291 extern uint32_t idle_proximate_io_wfe_unmasked;
292
293 #define CPUPM_IDLE_WFE 0x5310300
294 static bool
wfe_process_recommendation(void)295 wfe_process_recommendation(void)
296 {
297 bool ipending = false;
298 if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
299 /* Check for an active perf. controller generated
300 * WFE recommendation for this cluster.
301 */
302 cpu_data_t *cdp = getCpuDatap();
303 uint32_t cid = cdp->cpu_cluster_id;
304 uint64_t wfe_ttd = 0;
305 uint64_t wfe_deadline = 0;
306
307 if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
308 wfe_deadline = mach_absolute_time() + wfe_ttd;
309 }
310
311 if (wfe_deadline != 0) {
312 /* Poll issuing event-bounded WFEs until an interrupt
313 * arrives or the WFE recommendation expires
314 */
315 #if DEVELOPMENT || DEBUG
316 uint64_t wc = cdp->wfe_count;
317 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
318 #endif
319 /* Issue WFE until the recommendation expires,
320 * with IRQs unmasked.
321 */
322 ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true);
323 #if DEVELOPMENT || DEBUG
324 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
325 #endif
326 }
327 }
328 return ipending;
329 }
330
331 void
machine_idle(void)332 machine_idle(void)
333 {
334 /* Interrupts are expected to be masked on entry or re-entry via
335 * Idle_load_context()
336 */
337 assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
338 /* Check for, and act on, a WFE recommendation.
339 * Bypasses context spill/fill for a minor perf. increment.
340 * May unmask and restore IRQ+FIQ mask.
341 */
342 if (wfe_process_recommendation() == false) {
343 /* If WFE recommendation absent, or WFE deadline
344 * arrived with no interrupt pending/processed,
345 * fall back to WFI.
346 */
347 Idle_context();
348 }
349 __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
350 }
351
352 void
OSSynchronizeIO(void)353 OSSynchronizeIO(void)
354 {
355 __builtin_arm_dsb(DSB_SY);
356 }
357
/* Return the current value of ACTLR_EL1. */
uint64_t
get_aux_control(void)
{
	uint64_t actlr;

	MRS(actlr, "ACTLR_EL1");

	return actlr;
}
366
/* Return the current value of SCTLR_EL1. */
uint64_t
get_mmu_control(void)
{
	uint64_t sctlr;

	MRS(sctlr, "SCTLR_EL1");

	return sctlr;
}
375
/* Return the current value of TCR_EL1. */
uint64_t
get_tcr(void)
{
	uint64_t tcr;

	MRS(tcr, "TCR_EL1");

	return tcr;
}
384
385 boolean_t
ml_get_interrupts_enabled(void)386 ml_get_interrupts_enabled(void)
387 {
388 uint64_t value;
389
390 MRS(value, "DAIF");
391 if (value & DAIF_IRQF) {
392 return FALSE;
393 }
394 return TRUE;
395 }
396
397 pmap_paddr_t
get_mmu_ttb(void)398 get_mmu_ttb(void)
399 {
400 pmap_paddr_t value;
401
402 MRS(value, "TTBR0_EL1");
403 return value;
404 }
405
406 uint32_t
get_arm_cpu_version(void)407 get_arm_cpu_version(void)
408 {
409 uint32_t value = machine_read_midr();
410
411 /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
412 return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
413 }
414
/*
 * Return true when any bit of feature_bit is set in AIDR_EL1.
 */
bool
ml_feature_supported(uint32_t feature_bit)
{
	uint64_t aidr = 0;

	MRS(aidr, "AIDR_EL1");

	return (aidr & feature_bit) != 0;
}
425
426 /*
427 * user_cont_hwclock_allowed()
428 *
429 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
430 * as a continuous time source (e.g. from mach_continuous_time)
431 */
432 boolean_t
user_cont_hwclock_allowed(void)433 user_cont_hwclock_allowed(void)
434 {
435 #if HAS_CONTINUOUS_HWCLOCK
436 return TRUE;
437 #else
438 return FALSE;
439 #endif
440 }
441
442 /*
443 * user_timebase_type()
444 *
445 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
446 *
447 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
448 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
449 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
450 *
451 */
452
453 uint8_t
user_timebase_type(void)454 user_timebase_type(void)
455 {
456 #if __ARM_ARCH_8_6__
457 return USER_TIMEBASE_NOSPEC;
458 #else
459 return USER_TIMEBASE_SPEC;
460 #endif
461 }
462
463 void
machine_startup(__unused boot_args * args)464 machine_startup(__unused boot_args * args)
465 {
466 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
467 if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
468 gFastIPI = 1;
469 }
470 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
471
472
473 machine_conf();
474
475
476 /*
477 * Kick off the kernel bootstrap.
478 */
479 kernel_bootstrap();
480 /* NOTREACHED */
481 }
482
483 typedef void (*invalidate_fn_t)(void);
484
485 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
486
487 void set_invalidate_hmac_function(invalidate_fn_t fn);
488
489 void
set_invalidate_hmac_function(invalidate_fn_t fn)490 set_invalidate_hmac_function(invalidate_fn_t fn)
491 {
492 if (NULL != invalidate_hmac_function) {
493 panic("Invalidate HMAC function already set");
494 }
495
496 invalidate_hmac_function = fn;
497 }
498
499 void
machine_lockdown(void)500 machine_lockdown(void)
501 {
502 arm_vm_prot_finalize(PE_state.bootArgs);
503
504 #if CONFIG_KERNEL_INTEGRITY
505 #if KERNEL_INTEGRITY_WT
506 /* Watchtower
507 *
508 * Notify the monitor about the completion of early kernel bootstrap.
509 * From this point forward it will enforce the integrity of kernel text,
510 * rodata and page tables.
511 */
512
513 #ifdef MONITOR
514 monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
515 #endif
516 #endif /* KERNEL_INTEGRITY_WT */
517
518 #if XNU_MONITOR
519 pmap_lockdown_ppl();
520 #endif
521
522 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
523 /* KTRR
524 *
525 * Lock physical KTRR region. KTRR region is read-only. Memory outside
526 * the region is not executable at EL1.
527 */
528
529 rorgn_lockdown();
530 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
531
532
533 #endif /* CONFIG_KERNEL_INTEGRITY */
534
535
536 if (NULL != invalidate_hmac_function) {
537 invalidate_hmac_function();
538 }
539
540 lockdown_done = 1;
541 }
542
543
544 char *
machine_boot_info(__unused char * buf,__unused vm_size_t size)545 machine_boot_info(
546 __unused char *buf,
547 __unused vm_size_t size)
548 {
549 return PE_boot_args();
550 }
551
552 void
slave_machine_init(__unused void * param)553 slave_machine_init(__unused void *param)
554 {
555 cpu_machine_init(); /* Initialize the processor */
556 clock_init(); /* Init the clock */
557 }
558
559 /*
560 * Routine: machine_processor_shutdown
561 * Function:
562 */
563 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)564 machine_processor_shutdown(
565 __unused thread_t thread,
566 void (*doshutdown)(processor_t),
567 processor_t processor)
568 {
569 return Shutdown_context(doshutdown, processor);
570 }
571
572 /*
573 * Routine: ml_init_lock_timeout
574 * Function:
575 */
576 void
ml_init_lock_timeout(void)577 ml_init_lock_timeout(void)
578 {
579 /*
580 * This function is called after STARUP_SUB_TIMEOUTS
581 * initialization, so using the "legacy" boot-args here overrides
582 * the ml-timeout-... configuration. (Given that these boot-args
583 * here are usually explicitly specified, this makes sense by
584 * overriding ml-timeout-..., which may come from the device tree.
585 */
586
587 uint64_t lto_timeout_ns;
588 uint64_t lto_abstime;
589 uint32_t slto;
590
591 if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
592 lto_timeout_ns = slto * NSEC_PER_USEC;
593 nanoseconds_to_absolutetime(lto_timeout_ns, <o_abstime);
594 os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
595 } else {
596 lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
597 absolutetime_to_nanoseconds(lto_abstime, <o_timeout_ns);
598 }
599
600 os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
601
602 if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
603 nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, <o_abstime);
604 os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
605 } else if (lto_abstime != 0) {
606 os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
607 } // else take default from MACHINE_TIMEOUT.
608
609 uint64_t mtxspin;
610 uint64_t mtx_abstime;
611 if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
612 if (mtxspin > USEC_PER_SEC >> 4) {
613 mtxspin = USEC_PER_SEC >> 4;
614 }
615 nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
616 os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
617 } else {
618 mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
619 }
620
621 low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
622 /*
623 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
624 * real_ncpus is not set at this time
625 *
626 * NOTE: active spinning is disabled in arm. It can be activated
627 * by setting high_MutexSpin through the sysctl.
628 */
629 high_MutexSpin = low_MutexSpin;
630
631 uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
632 PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
633 nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
634 }
635
636 /*
637 * This is called when all of the ml_processor_info_t structures have been
638 * initialized and all the processors have been started through processor_start().
639 *
640 * Required by the scheduler subsystem.
641 */
642 void
ml_cpu_init_completed(void)643 ml_cpu_init_completed(void)
644 {
645 if (SCHED(cpu_init_completed) != NULL) {
646 SCHED(cpu_init_completed)();
647 }
648 }
649
650 /*
651 * This is called from the machine-independent routine cpu_up()
652 * to perform machine-dependent info updates.
653 */
654 void
ml_cpu_up(void)655 ml_cpu_up(void)
656 {
657 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[ml_get_cpu_number_local()];
658
659 os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
660
661 os_atomic_inc(&machine_info.physical_cpu, relaxed);
662 os_atomic_inc(&machine_info.logical_cpu, relaxed);
663 }
664
665 /*
666 * This is called from the machine-independent routine cpu_down()
667 * to perform machine-dependent info updates.
668 */
669 void
ml_cpu_down(void)670 ml_cpu_down(void)
671 {
672 cpu_data_t *cpu_data_ptr;
673 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[ml_get_cpu_number_local()];
674
675 os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
676
677 os_atomic_dec(&machine_info.physical_cpu, relaxed);
678 os_atomic_dec(&machine_info.logical_cpu, relaxed);
679
680 /*
681 * If we want to deal with outstanding IPIs, we need to
682 * do relatively early in the processor_doshutdown path,
683 * as we pend decrementer interrupts using the IPI
684 * mechanism if we cannot immediately service them (if
685 * IRQ is masked). Do so now.
686 *
687 * We aren't on the interrupt stack here; would it make
688 * more sense to disable signaling and then enable
689 * interrupts? It might be a bit cleaner.
690 */
691 cpu_data_ptr = getCpuDatap();
692 cpu_data_ptr->cpu_running = FALSE;
693
694 if (cpu_data_ptr != &BootCpuData) {
695 /*
696 * Move all of this cpu's timers to the master/boot cpu,
697 * and poke it in case there's a sooner deadline for it to schedule.
698 */
699 timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
700 cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, NULL);
701 }
702
703 cpu_signal_handler_internal(TRUE);
704 }
705
706 unsigned int
ml_get_machine_mem(void)707 ml_get_machine_mem(void)
708 {
709 return machine_info.memory_size;
710 }
711
712 __attribute__((noreturn))
713 void
halt_all_cpus(boolean_t reboot)714 halt_all_cpus(boolean_t reboot)
715 {
716 if (reboot) {
717 printf("MACH Reboot\n");
718 PEHaltRestart(kPERestartCPU);
719 } else {
720 printf("CPU halted\n");
721 PEHaltRestart(kPEHaltCPU);
722 }
723 while (1) {
724 ;
725 }
726 }
727
728 __attribute__((noreturn))
729 void
halt_cpu(void)730 halt_cpu(void)
731 {
732 halt_all_cpus(FALSE);
733 }
734
735 /*
736 * Routine: machine_signal_idle
737 * Function:
738 */
739 void
machine_signal_idle(processor_t processor)740 machine_signal_idle(
741 processor_t processor)
742 {
743 cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
744 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
745 }
746
747 void
machine_signal_idle_deferred(processor_t processor)748 machine_signal_idle_deferred(
749 processor_t processor)
750 {
751 cpu_signal_deferred(processor_to_cpu_datap(processor));
752 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
753 }
754
755 void
machine_signal_idle_cancel(processor_t processor)756 machine_signal_idle_cancel(
757 processor_t processor)
758 {
759 cpu_signal_cancel(processor_to_cpu_datap(processor));
760 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
761 }
762
763 /*
764 * Routine: ml_install_interrupt_handler
765 * Function: Initialize Interrupt Handler
766 */
767 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)768 ml_install_interrupt_handler(
769 void *nub,
770 int source,
771 void *target,
772 IOInterruptHandler handler,
773 void *refCon)
774 {
775 cpu_data_t *cpu_data_ptr;
776 boolean_t current_state;
777
778 current_state = ml_set_interrupts_enabled(FALSE);
779 cpu_data_ptr = getCpuDatap();
780
781 cpu_data_ptr->interrupt_nub = nub;
782 cpu_data_ptr->interrupt_source = source;
783 cpu_data_ptr->interrupt_target = target;
784 cpu_data_ptr->interrupt_handler = handler;
785 cpu_data_ptr->interrupt_refCon = refCon;
786
787 (void) ml_set_interrupts_enabled(current_state);
788 }
789
/*
 *	Routine:        ml_init_interrupt
 *	Function:	Initialize Interrupts. On HAS_IPI builds this programs the
 *	                deferred IPI timer; on other builds it does nothing.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * ml_init_interrupt will get called once for each CPU, but this is redundant
	 * because there is only one global copy of the register for skye, so the
	 * write is issued only from a CPU whose cpu data has cluster_master set.
	 */
	if (getCpuDatap()->cluster_master) {
		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
	}
#endif
}
808
809 /*
810 * Routine: ml_init_timebase
811 * Function: register and setup Timebase, Decremeter services
812 */
813 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)814 ml_init_timebase(
815 void *args,
816 tbd_ops_t tbd_funcs,
817 vm_offset_t int_address,
818 vm_offset_t int_value __unused)
819 {
820 cpu_data_t *cpu_data_ptr;
821
822 cpu_data_ptr = (cpu_data_t *)args;
823
824 if ((cpu_data_ptr == &BootCpuData)
825 && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
826 rtclock_timebase_func = *tbd_funcs;
827 rtclock_timebase_addr = int_address;
828 }
829 }
830
831 #define ML_READPROP_MANDATORY UINT64_MAX
832
833 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)834 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
835 {
836 void const *prop;
837 unsigned int propSize;
838
839 if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
840 if (propSize == sizeof(uint8_t)) {
841 return *((uint8_t const *)prop);
842 } else if (propSize == sizeof(uint16_t)) {
843 return *((uint16_t const *)prop);
844 } else if (propSize == sizeof(uint32_t)) {
845 return *((uint32_t const *)prop);
846 } else if (propSize == sizeof(uint64_t)) {
847 return *((uint64_t const *)prop);
848 } else {
849 panic("CPU property '%s' has bad size %u", propertyName, propSize);
850 }
851 } else {
852 if (default_value == ML_READPROP_MANDATORY) {
853 panic("Missing mandatory property '%s'", propertyName);
854 }
855 return default_value;
856 }
857 }
858
859 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)860 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
861 {
862 uint64_t const *prop;
863 unsigned int propSize;
864
865 if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
866 return FALSE;
867 }
868
869 if (propSize != sizeof(uint64_t) * 2) {
870 panic("Wrong property size for %s", propertyName);
871 }
872
873 *pa_ptr = prop[0];
874 *len_ptr = prop[1];
875 return TRUE;
876 }
877
878 static boolean_t
ml_is_boot_cpu(const DTEntry entry)879 ml_is_boot_cpu(const DTEntry entry)
880 {
881 void const *prop;
882 unsigned int propSize;
883
884 if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
885 panic("unable to retrieve state for cpu");
886 }
887
888 if (strncmp((char const *)prop, "running", propSize) == 0) {
889 return TRUE;
890 } else {
891 return FALSE;
892 }
893 }
894
895 static void
ml_read_chip_revision(unsigned int * rev __unused)896 ml_read_chip_revision(unsigned int *rev __unused)
897 {
898 // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
899 #ifdef APPLE_ARM64_ARCH_FAMILY
900 DTEntry entryP;
901
902 if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
903 *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
904 } else {
905 *rev = CPU_VERSION_UNKNOWN;
906 }
907 #endif
908 }
909
910 void
ml_parse_cpu_topology(void)911 ml_parse_cpu_topology(void)
912 {
913 DTEntry entry, child __unused;
914 OpaqueDTEntryIterator iter;
915 uint32_t cpu_boot_arg = MAX_CPUS;
916 uint64_t cpumask_boot_arg = ULLONG_MAX;
917 int err;
918
919 int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
920 int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
921 const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
922 const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
923
924 // The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
925 // so that we trigger a panic later in the boot process, once serial is enabled.
926 if (cpus_boot_arg_present && cpumask_boot_arg_present) {
927 cpu_config_correct = false;
928 }
929
930 err = SecureDTLookupEntry(NULL, "/cpus", &entry);
931 assert(err == kSuccess);
932
933 err = SecureDTInitEntryIterator(entry, &iter);
934 assert(err == kSuccess);
935
936 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
937 cluster_offsets[i] = -1;
938 cluster_phys_to_logical[i] = -1;
939 cluster_max_cpu_phys_id[i] = 0;
940 }
941
942 while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
943 boolean_t is_boot_cpu = ml_is_boot_cpu(child);
944 boolean_t cpu_enabled = cpumask_boot_arg & 1;
945 cpumask_boot_arg >>= 1;
946
947 // Boot CPU disabled in cpumask. Flag this so that we trigger a panic
948 // later in the boot process, once serial is enabled.
949 if (is_boot_cpu && !cpu_enabled) {
950 cpu_config_correct = false;
951 }
952
953 // Ignore this CPU if it has been disabled by the cpumask= boot-arg.
954 if (!is_boot_cpu && !cpu_enabled) {
955 continue;
956 }
957
958 // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
959 // been added to the topology struct yet, and we only have one slot left, then skip
960 // every other non-boot CPU in order to leave room for the boot CPU.
961 //
962 // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
963 // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted.
964 if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
965 continue;
966 }
967 if (topology_info.num_cpus >= cpu_boot_arg) {
968 break;
969 }
970
971 ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
972
973 cpu->cpu_id = topology_info.num_cpus++;
974 assert(cpu->cpu_id < MAX_CPUS);
975 topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
976
977 cpu->reserved = 0;
978 topology_info.reserved = 0;
979
980 cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
981
982 cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
983 cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
984 cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
985 cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
986 cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
987
988 ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
989 ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
990 ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
991 cpu->cluster_type = CLUSTER_TYPE_SMP;
992
993 int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
994 if (cluster_type == 'E') {
995 cpu->cluster_type = CLUSTER_TYPE_E;
996 } else if (cluster_type == 'P') {
997 cpu->cluster_type = CLUSTER_TYPE_P;
998 }
999
1000 topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1001
1002 /*
1003 * Since we want to keep a linear cluster ID space, we cannot just rely
1004 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1005 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1006 */
1007 #if HAS_CLUSTER
1008 uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1009 #else
1010 uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1011 #endif
1012 assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1013 cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1014 topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1015
1016 assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1017
1018 ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1019 if (cluster->num_cpus == 0) {
1020 assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1021
1022 topology_info.num_clusters++;
1023 topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1024 topology_info.cluster_types |= (1 << cpu->cluster_type);
1025
1026 cluster->cluster_id = cpu->cluster_id;
1027 cluster->cluster_type = cpu->cluster_type;
1028 cluster->first_cpu_id = cpu->cpu_id;
1029 assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1030 cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1031
1032 // Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1033 // If we wind up with a bunch of these, we might want to create separate per-cluster
1034 // EDT nodes and have the CPU nodes reference them through a phandle.
1035 ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1036 ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1037 }
1038
1039 #if HAS_CLUSTER
1040 if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1041 cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1042 }
1043 #endif
1044
1045 cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1046 cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1047
1048 cluster->num_cpus++;
1049 cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1050
1051 if (is_boot_cpu) {
1052 assert(topology_info.boot_cpu == NULL);
1053 topology_info.boot_cpu = cpu;
1054 topology_info.boot_cluster = cluster;
1055 }
1056 }
1057
1058 #if HAS_CLUSTER
1059 /*
1060 * Build the cluster offset array, ensuring that the region reserved
1061 * for each physical cluster contains enough entries to be indexed
1062 * by the maximum physical CPU ID (AFF0) within the cluster.
1063 */
1064 unsigned int cur_cluster_offset = 0;
1065 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1066 if (cluster_phys_to_logical[i] != -1) {
1067 cluster_offsets[i] = cur_cluster_offset;
1068 cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1069 }
1070 }
1071 assert(cur_cluster_offset <= MAX_CPUS);
1072 #else
1073 /*
1074 * For H10, there are really 2 physical clusters, but they are not separated
1075 * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering
1076 * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing,
1077 * treat H10 and earlier devices as though they contain a single cluster.
1078 */
1079 cluster_offsets[0] = 0;
1080 #endif
1081 assert(topology_info.boot_cpu != NULL);
1082 ml_read_chip_revision(&topology_info.chip_revision);
1083
1084 /*
1085 * Set TPIDR_EL0 to indicate the correct cpu number, as we may
1086 * not be booting from cpu 0. Userspace will consume the current
1087 * CPU number through this register. For non-boot cores, this is
1088 * done in start.s (start_cpu) using the cpu_number field of the
1089 * per-cpu data object.
1090 */
1091 uint64_t cpuid = topology_info.boot_cpu->cpu_id;
1092
1093 __builtin_arm_wsr64("TPIDR_EL0", cpuid & MACHDEP_TPIDR_CPUNUM_MASK);
1094 assert((cpuid & MACHDEP_TPIDR_CPUNUM_MASK) == cpuid);
1095 __builtin_arm_wsr64("TPIDRRO_EL0", 0);
1096 }
1097
1098 const ml_topology_info_t *
ml_get_topology_info(void)1099 ml_get_topology_info(void)
1100 {
1101 return &topology_info;
1102 }
1103
1104 void
ml_map_cpu_pio(void)1105 ml_map_cpu_pio(void)
1106 {
1107 unsigned int i;
1108
1109 for (i = 0; i < topology_info.num_cpus; i++) {
1110 ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1111 if (cpu->cpu_IMPL_pa) {
1112 cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1113 cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1114 }
1115 if (cpu->cpu_UTTDBG_pa) {
1116 cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1117 }
1118 }
1119
1120 for (i = 0; i < topology_info.num_clusters; i++) {
1121 ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1122 if (cluster->acc_IMPL_pa) {
1123 cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1124 }
1125 if (cluster->cpm_IMPL_pa) {
1126 cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1127 }
1128 }
1129 }
1130
1131 unsigned int
ml_get_cpu_count(void)1132 ml_get_cpu_count(void)
1133 {
1134 return topology_info.num_cpus;
1135 }
1136
1137 unsigned int
ml_get_cluster_count(void)1138 ml_get_cluster_count(void)
1139 {
1140 return topology_info.num_clusters;
1141 }
1142
1143 int
ml_get_boot_cpu_number(void)1144 ml_get_boot_cpu_number(void)
1145 {
1146 return topology_info.boot_cpu->cpu_id;
1147 }
1148
1149 cluster_type_t
ml_get_boot_cluster_type(void)1150 ml_get_boot_cluster_type(void)
1151 {
1152 return topology_info.boot_cluster->cluster_type;
1153 }
1154
1155 int
ml_get_cpu_number(uint32_t phys_id)1156 ml_get_cpu_number(uint32_t phys_id)
1157 {
1158 phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1159
1160 for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1161 if (topology_info.cpus[i].phys_id == phys_id) {
1162 return i;
1163 }
1164 }
1165
1166 return -1;
1167 }
1168
1169 int
ml_get_cluster_number(uint32_t phys_id)1170 ml_get_cluster_number(uint32_t phys_id)
1171 {
1172 int cpu_id = ml_get_cpu_number(phys_id);
1173 if (cpu_id < 0) {
1174 return -1;
1175 }
1176
1177 ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1178
1179 return cpu->cluster_id;
1180 }
1181
/*
 * Routine:	ml_get_cpu_number_local
 * Function:	Return the logical CPU number of the calling CPU, derived
 *		from the value read out of MPIDR_EL1.  A failed lookup (-1)
 *		wraps to a large unsigned value and trips the assertion.
 */
unsigned int
ml_get_cpu_number_local(void)
{
	uint64_t mpidr = 0;

	/* We identify the CPU based on the constant bits of MPIDR_EL1. */
	MRS(mpidr, "MPIDR_EL1");

	unsigned cpu_id = ml_get_cpu_number((uint32_t)mpidr);

	assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());

	return cpu_id;
}
1196
/*
 * Routine:	ml_get_cluster_number_local
 * Function:	Return the logical cluster ID of the calling CPU, derived
 *		from the value read out of MPIDR_EL1.  A failed lookup (-1)
 *		wraps to a large unsigned value and trips the assertion.
 */
int
ml_get_cluster_number_local()
{
	uint64_t mpidr = 0;

	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
	MRS(mpidr, "MPIDR_EL1");

	unsigned cluster_id = ml_get_cluster_number((uint32_t)mpidr);

	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());

	return cluster_id;
}
1211
1212 int
ml_get_max_cpu_number(void)1213 ml_get_max_cpu_number(void)
1214 {
1215 return topology_info.max_cpu_id;
1216 }
1217
1218 int
ml_get_max_cluster_number(void)1219 ml_get_max_cluster_number(void)
1220 {
1221 return topology_info.max_cluster_id;
1222 }
1223
1224 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1225 ml_get_first_cpu_id(unsigned int cluster_id)
1226 {
1227 return topology_info.clusters[cluster_id].first_cpu_id;
1228 }
1229
1230
/*
 * Routine:	ml_lockdown_init
 * Function:	On KTRR/CTRR configurations, call rorgn_stash_range();
 *		otherwise this is a no-op.
 */
void
ml_lockdown_init()
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif /* KERNEL_INTEGRITY_KTRR || KERNEL_INTEGRITY_CTRR */
}
1238
1239 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1240 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1241 {
1242 if (!f) {
1243 return KERN_FAILURE;
1244 }
1245
1246 assert(lockdown_done);
1247 f(this); // XXX: f this whole function
1248
1249 return KERN_SUCCESS;
1250 }
1251
1252 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1253 ml_processor_register(ml_processor_info_t *in_processor_info,
1254 processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1255 perfmon_interrupt_handler_func *pmi_handler_out)
1256 {
1257 cpu_data_t *this_cpu_datap;
1258 processor_set_t pset;
1259 boolean_t is_boot_cpu;
1260 static unsigned int reg_cpu_count = 0;
1261
1262 if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1263 return KERN_FAILURE;
1264 }
1265
1266 if ((unsigned)OSIncrementAtomic((SInt32*)®_cpu_count) >= topology_info.num_cpus) {
1267 return KERN_FAILURE;
1268 }
1269
1270 if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1271 is_boot_cpu = FALSE;
1272 this_cpu_datap = cpu_data_alloc(FALSE);
1273 cpu_data_init(this_cpu_datap);
1274 } else {
1275 this_cpu_datap = &BootCpuData;
1276 is_boot_cpu = TRUE;
1277 }
1278
1279 assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1280
1281 this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1282
1283 if (!is_boot_cpu) {
1284 this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1285
1286 if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1287 goto processor_register_error;
1288 }
1289 assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1290 }
1291
1292 this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1293 this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1294 nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1295 this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1296
1297 this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1298 this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1299
1300 this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1301 this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1302 this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1303 this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1304
1305 this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1306 this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1307 this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1308 this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1309 this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1310 this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1311
1312 #if HAS_CLUSTER
1313 this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1314 #else /* HAS_CLUSTER */
1315 this_cpu_datap->cluster_master = is_boot_cpu;
1316 #endif /* HAS_CLUSTER */
1317 pset = pset_find(in_processor_info->cluster_id, NULL);
1318 if (pset == NULL) {
1319 #if __AMP__
1320 pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1321 pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1322 assert(pset != PROCESSOR_SET_NULL);
1323 #else /* __AMP__ */
1324 pset_cluster_type_t pset_cluster_type = PSET_SMP;
1325 pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1326 assert(pset != PROCESSOR_SET_NULL);
1327 #endif /* __AMP__ */
1328 }
1329 kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1330
1331 processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1332 if (!is_boot_cpu) {
1333 processor_init(processor, this_cpu_datap->cpu_number, pset);
1334
1335 if (this_cpu_datap->cpu_l2_access_penalty) {
1336 /*
1337 * Cores that have a non-zero L2 access penalty compared
1338 * to the boot processor should be de-prioritized by the
1339 * scheduler, so that threads use the cores with better L2
1340 * preferentially.
1341 */
1342 processor_set_primary(processor, master_processor);
1343 }
1344 }
1345
1346 *processor_out = processor;
1347 *ipi_handler_out = cpu_signal_handler;
1348 #if CPMU_AIC_PMI && MONOTONIC
1349 *pmi_handler_out = mt_cpmu_aic_pmi;
1350 #else
1351 *pmi_handler_out = NULL;
1352 #endif /* CPMU_AIC_PMI && MONOTONIC */
1353 if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1354 *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1355 }
1356
1357 #if KPC
1358 if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1359 goto processor_register_error;
1360 }
1361 #endif /* KPC */
1362
1363 if (!is_boot_cpu) {
1364 random_cpu_init(this_cpu_datap->cpu_number);
1365 // now let next CPU register itself
1366 OSIncrementAtomic((SInt32*)&real_ncpus);
1367 }
1368
1369 return KERN_SUCCESS;
1370
1371 processor_register_error:
1372 #if KPC
1373 kpc_unregister_cpu(this_cpu_datap);
1374 #endif /* KPC */
1375 if (!is_boot_cpu) {
1376 cpu_data_free(this_cpu_datap);
1377 }
1378
1379 return KERN_FAILURE;
1380 }
1381
1382 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1383 ml_init_arm_debug_interface(
1384 void * in_cpu_datap,
1385 vm_offset_t virt_address)
1386 {
1387 ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1388 do_debugid();
1389 }
1390
1391 /*
1392 * Routine: init_ast_check
1393 * Function:
1394 */
1395 void
init_ast_check(__unused processor_t processor)1396 init_ast_check(
1397 __unused processor_t processor)
1398 {
1399 }
1400
1401 /*
1402 * Routine: cause_ast_check
1403 * Function:
1404 */
1405 void
cause_ast_check(processor_t processor)1406 cause_ast_check(
1407 processor_t processor)
1408 {
1409 if (current_processor() != processor) {
1410 cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1411 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1412 }
1413 }
1414
1415 extern uint32_t cpu_idle_count;
1416
1417 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1418 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1419 {
1420 *icp = ml_at_interrupt_context();
1421 *pidlep = (cpu_idle_count == real_ncpus);
1422 }
1423
/*
 *	Routine:        ml_cause_interrupt
 *	Function:	Generate a fake interrupt.  Currently a stub that
 *			returns without doing anything.
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX: not implemented. */
}
1433
1434 /* Map memory map IO space */
1435 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1436 ml_io_map(
1437 vm_offset_t phys_addr,
1438 vm_size_t size)
1439 {
1440 return io_map(phys_addr, size, VM_WIMG_IO);
1441 }
1442
1443 /* Map memory map IO space (with protections specified) */
1444 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1445 ml_io_map_with_prot(
1446 vm_offset_t phys_addr,
1447 vm_size_t size,
1448 vm_prot_t prot)
1449 {
1450 return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot);
1451 }
1452
1453 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1454 ml_io_map_wcomb(
1455 vm_offset_t phys_addr,
1456 vm_size_t size)
1457 {
1458 return io_map(phys_addr, size, VM_WIMG_WCOMB);
1459 }
1460
1461 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1462 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1463 {
1464 pmap_remove(kernel_pmap, addr, addr + sz);
1465 kmem_free(kernel_map, addr, sz);
1466 }
1467
1468 /* boot memory allocation */
1469 vm_offset_t
ml_static_malloc(__unused vm_size_t size)1470 ml_static_malloc(
1471 __unused vm_size_t size)
1472 {
1473 return (vm_offset_t) NULL;
1474 }
1475
1476 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1477 ml_map_high_window(
1478 vm_offset_t phys_addr,
1479 vm_size_t len)
1480 {
1481 return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1482 }
1483
1484 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1485 ml_static_ptovirt(
1486 vm_offset_t paddr)
1487 {
1488 return phystokv(paddr);
1489 }
1490
1491 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1492 ml_static_slide(
1493 vm_offset_t vaddr)
1494 {
1495 vm_offset_t slid_vaddr = vaddr + vm_kernel_slide;
1496
1497 if ((slid_vaddr < vm_kernelcache_base) || (slid_vaddr >= vm_kernelcache_top)) {
1498 /* This is only intended for use on kernelcache addresses. */
1499 return 0;
1500 }
1501
1502 /*
1503 * Because the address is in the kernelcache, we can do a simple
1504 * slide calculation.
1505 */
1506 return slid_vaddr;
1507 }
1508
1509 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1510 ml_static_unslide(
1511 vm_offset_t vaddr)
1512 {
1513 if ((vaddr < vm_kernelcache_base) || (vaddr >= vm_kernelcache_top)) {
1514 /* This is only intended for use on kernelcache addresses. */
1515 return 0;
1516 }
1517
1518 return vaddr - vm_kernel_slide;
1519 }
1520
extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);

/*
 * Routine:	ml_static_protect
 * Function:	Change the protections of the kernel mappings covering
 *		[vaddr, trunc_page_64(vaddr + size)) to new_prot by editing
 *		the page table entries directly, then flush the TLB for the
 *		range that was walked.
 *
 *		Pages for which pmap_find_phys() reports no physical page are
 *		skipped.  Block mappings and PTEs carrying the contiguous hint
 *		are never modified: they are accepted only if they already
 *		have the requested protections; otherwise the walk stops.
 *
 * Panics:	if vaddr is below VM_MIN_KERNEL_ADDRESS, if new_prot requests
 *		both write and execute, or if execute is requested after
 *		lockdown_done is set.
 *
 * Returns:	KERN_SUCCESS, or KERN_FAILURE if a block or contiguous-hint
 *		mapping with different protections was encountered.  Pages
 *		preceding the failing one may already have been updated.
 */
kern_return_t
ml_static_protect(
	vm_offset_t vaddr, /* kernel virtual address */
	vm_size_t size,
	vm_prot_t new_prot)
{
	pt_entry_t    arm_prot = 0;             /* desired protection bits for an L3 PTE */
	pt_entry_t    arm_block_prot = 0;       /* the same protections in block-TTE form */
	vm_offset_t   vaddr_cur;
	ppnum_t       ppn;
	kern_return_t result = KERN_SUCCESS;

	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) VM_MIN_KERNEL_ADDRESS);
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	/* NX is applied unconditionally; PNX is added only when execute was not requested. */
	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		/* Only pages that currently have a physical page behind them are touched. */
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t      *tte2;
			pt_entry_t      *pte_p;
			pt_entry_t      ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections.  We still want to run checks on a per-page basis.
					 */
					continue;
				}

				/* Not a table entry and not a block with matching protections: give up. */
				result = KERN_FAILURE;
				break;
			}

			/* Locate the L3 PTE for vaddr_cur inside the table referenced by the TTE. */
			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				{
					unsigned int    i;
					pt_entry_t      *ptep_iter;

					/* Update the four consecutive PTEs starting at pte_p. */
					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	/*
	 * Flush the TLB for everything walked so far, including on the failure
	 * path.  The assert checks that the length fits in the 32-bit argument.
	 */
	if (vaddr_cur > vaddr) {
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}


	return result;
}
1647
1648 /*
1649 * Routine: ml_static_mfree
1650 * Function:
1651 */
1652 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)1653 ml_static_mfree(
1654 vm_offset_t vaddr,
1655 vm_size_t size)
1656 {
1657 vm_offset_t vaddr_cur;
1658 ppnum_t ppn;
1659 uint32_t freed_pages = 0;
1660 uint32_t bad_page_cnt = 0;
1661 uint32_t freed_kernelcache_pages = 0;
1662
1663 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
1664 /* For testing hitting a bad ram page */
1665 static int count = 0;
1666 static int bad_at_cnt = -1;
1667 static bool first = true;
1668
1669 if (first) {
1670 (void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt));
1671 first = false;
1672 }
1673 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
1674
1675 /* It is acceptable (if bad) to fail to free. */
1676 if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1677 return;
1678 }
1679
1680 assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1681
1682 for (vaddr_cur = vaddr;
1683 vaddr_cur < trunc_page_64(vaddr + size);
1684 vaddr_cur += PAGE_SIZE) {
1685 ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1686 if (ppn != (vm_offset_t) NULL) {
1687 /*
1688 * It is not acceptable to fail to update the protections on a page
1689 * we will release to the VM. We need to either panic or continue.
1690 * For now, we'll panic (to help flag if there is memory we can
1691 * reclaim).
1692 */
1693 if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
1694 panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1695 }
1696
1697 #if defined(__arm64__)
1698 bool is_bad = pmap_is_bad_ram(ppn);
1699 #if DEVELOPMENT || DEBUG
1700 is_bad |= (count++ == bad_at_cnt);
1701 #endif /* DEVELOPMENT || DEBUG */
1702
1703 if (is_bad) {
1704 ++bad_page_cnt;
1705 vm_page_create_retired(ppn);
1706 continue;
1707 }
1708 #endif /* defined(__arm64__) */
1709
1710 vm_page_create(ppn, (ppn + 1));
1711 freed_pages++;
1712 if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) {
1713 freed_kernelcache_pages++;
1714 }
1715 }
1716 }
1717 vm_page_lockspin_queues();
1718 vm_page_wire_count -= freed_pages;
1719 vm_page_wire_count_initial -= freed_pages;
1720 vm_page_kernelcache_count -= freed_kernelcache_pages;
1721 vm_page_unlock_queues();
1722 #if DEBUG
1723 kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
1724 #endif
1725 }
1726
1727
1728 /* virtual to physical on wired pages */
1729 vm_offset_t
ml_vtophys(vm_offset_t vaddr)1730 ml_vtophys(vm_offset_t vaddr)
1731 {
1732 return kvtophys(vaddr);
1733 }
1734
1735 /*
1736 * Routine: ml_nofault_copy
1737 * Function: Perform a physical mode copy if the source and destination have
1738 * valid translations in the kernel pmap. If translations are present, they are
1739 * assumed to be wired; e.g., no attempt is made to guarantee that the
1740 * translations obtained remain valid for the duration of the copy process.
1741 */
1742 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)1743 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
1744 {
1745 addr64_t cur_phys_dst, cur_phys_src;
1746 vm_size_t count, nbytes = 0;
1747
1748 while (size > 0) {
1749 if (!(cur_phys_src = kvtophys(virtsrc))) {
1750 break;
1751 }
1752 if (!(cur_phys_dst = kvtophys(virtdst))) {
1753 break;
1754 }
1755 if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
1756 !pmap_valid_address(trunc_page_64(cur_phys_src))) {
1757 break;
1758 }
1759 count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
1760 if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
1761 count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
1762 }
1763 if (count > size) {
1764 count = size;
1765 }
1766
1767 bcopy_phys(cur_phys_src, cur_phys_dst, count);
1768
1769 nbytes += count;
1770 virtsrc += count;
1771 virtdst += count;
1772 size -= count;
1773 }
1774
1775 return nbytes;
1776 }
1777
1778 /*
1779 * Routine: ml_validate_nofault
1780 * Function: Validate that ths address range has a valid translations
1781 * in the kernel pmap. If translations are present, they are
1782 * assumed to be wired; i.e. no attempt is made to guarantee
1783 * that the translation persist after the check.
1784 * Returns: TRUE if the range is mapped and will not cause a fault,
1785 * FALSE otherwise.
1786 */
1787
1788 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)1789 ml_validate_nofault(
1790 vm_offset_t virtsrc, vm_size_t size)
1791 {
1792 addr64_t cur_phys_src;
1793 uint32_t count;
1794
1795 while (size > 0) {
1796 if (!(cur_phys_src = kvtophys(virtsrc))) {
1797 return FALSE;
1798 }
1799 if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
1800 return FALSE;
1801 }
1802 count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
1803 if (count > size) {
1804 count = (uint32_t)size;
1805 }
1806
1807 virtsrc += count;
1808 size -= count;
1809 }
1810
1811 return TRUE;
1812 }
1813
1814 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)1815 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
1816 {
1817 *phys_addr = 0;
1818 *size = 0;
1819 }
1820
1821 void
active_rt_threads(__unused boolean_t active)1822 active_rt_threads(__unused boolean_t active)
1823 {
1824 }
1825
1826 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)1827 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
1828 {
1829 return;
1830 }
1831
1832 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
1833
1834 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)1835 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
1836 {
1837 if (cpu_qos_cb != NULL) {
1838 cpu_qos_update = cpu_qos_cb;
1839 } else {
1840 cpu_qos_update = cpu_qos_cb_default;
1841 }
1842 }
1843
1844 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)1845 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
1846 {
1847 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
1848
1849 cpu_qos_update((int)urgency, rt_period, rt_deadline);
1850
1851 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
1852 }
1853
1854 void
machine_run_count(__unused uint32_t count)1855 machine_run_count(__unused uint32_t count)
1856 {
1857 }
1858
1859 processor_t
machine_choose_processor(__unused processor_set_t pset,processor_t processor)1860 machine_choose_processor(__unused processor_set_t pset, processor_t processor)
1861 {
1862 return processor;
1863 }
1864
1865 #if KASAN
1866 vm_offset_t ml_stack_base(void);
1867 vm_size_t ml_stack_size(void);
1868
1869 vm_offset_t
ml_stack_base(void)1870 ml_stack_base(void)
1871 {
1872 uintptr_t local = (uintptr_t) &local;
1873 vm_offset_t intstack_top_ptr;
1874
1875 intstack_top_ptr = getCpuDatap()->intstack_top;
1876 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1877 return intstack_top_ptr - INTSTACK_SIZE;
1878 } else {
1879 return current_thread()->kernel_stack;
1880 }
1881 }
1882 vm_size_t
ml_stack_size(void)1883 ml_stack_size(void)
1884 {
1885 uintptr_t local = (uintptr_t) &local;
1886 vm_offset_t intstack_top_ptr;
1887
1888 intstack_top_ptr = getCpuDatap()->intstack_top;
1889 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1890 return INTSTACK_SIZE;
1891 } else {
1892 return kernel_stack_size;
1893 }
1894 }
1895 #endif
1896
1897 #ifdef CONFIG_KCOV
1898
1899 kcov_cpu_data_t *
current_kcov_data(void)1900 current_kcov_data(void)
1901 {
1902 return ¤t_cpu_datap()->cpu_kcov_data;
1903 }
1904
1905 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)1906 cpu_kcov_data(int cpuid)
1907 {
1908 return &cpu_datap(cpuid)->cpu_kcov_data;
1909 }
1910
1911 #endif /* CONFIG_KCOV */
1912
1913 boolean_t
machine_timeout_suspended(void)1914 machine_timeout_suspended(void)
1915 {
1916 return FALSE;
1917 }
1918
1919 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)1920 ml_interrupt_prewarm(__unused uint64_t deadline)
1921 {
1922 return KERN_FAILURE;
1923 }
1924
1925 /*
1926 * Assumes fiq, irq disabled.
1927 */
1928 void
ml_set_decrementer(uint32_t dec_value)1929 ml_set_decrementer(uint32_t dec_value)
1930 {
1931 cpu_data_t *cdp = getCpuDatap();
1932
1933 assert(ml_get_interrupts_enabled() == FALSE);
1934 cdp->cpu_decrementer = dec_value;
1935
1936 if (cdp->cpu_set_decrementer_func) {
1937 cdp->cpu_set_decrementer_func(dec_value);
1938 } else {
1939 __builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
1940 }
1941 }
1942
1943
1944 uint64_t
ml_get_hwclock()1945 ml_get_hwclock()
1946 {
1947 uint64_t timebase;
1948
1949 #if __ARM_ARCH_8_6__
1950 timebase = __builtin_arm_rsr64("CNTVCTSS_EL0");
1951 #else
1952 // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
1953 // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
1954 // to other instructions executed on the same processor."
1955 __builtin_arm_isb(ISB_SY);
1956 timebase = __builtin_arm_rsr64("CNTVCT_EL0");
1957 #endif
1958
1959 return timebase;
1960 }
1961
1962 uint64_t
ml_get_timebase()1963 ml_get_timebase()
1964 {
1965 uint64_t clock, timebase;
1966
1967 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
1968 do {
1969 timebase = getCpuDatap()->cpu_base_timebase;
1970 os_compiler_barrier();
1971 clock = ml_get_hwclock();
1972 os_compiler_barrier();
1973 } while (getCpuDatap()->cpu_base_timebase != timebase);
1974
1975 return clock + timebase;
1976 }
1977
1978 /*
1979 * Get the speculative timebase without an ISB.
1980 */
1981 uint64_t
ml_get_speculative_timebase()1982 ml_get_speculative_timebase()
1983 {
1984 uint64_t clock, timebase;
1985
1986 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
1987 do {
1988 timebase = getCpuDatap()->cpu_base_timebase;
1989 os_compiler_barrier();
1990 clock = __builtin_arm_rsr64("CNTVCT_EL0");
1991
1992 os_compiler_barrier();
1993 } while (getCpuDatap()->cpu_base_timebase != timebase);
1994
1995 return clock + timebase;
1996 }
1997
/*
 * Routine:	ml_get_timebase_entropy
 * Function:	Return the value of ml_get_speculative_timebase().
 */
uint64_t
ml_get_timebase_entropy(void)
{
	uint64_t entropy = ml_get_speculative_timebase();

	return entropy;
}
2003
2004 uint32_t
ml_get_decrementer()2005 ml_get_decrementer()
2006 {
2007 cpu_data_t *cdp = getCpuDatap();
2008 uint32_t dec;
2009
2010 assert(ml_get_interrupts_enabled() == FALSE);
2011
2012 if (cdp->cpu_get_decrementer_func) {
2013 dec = cdp->cpu_get_decrementer_func();
2014 } else {
2015 uint64_t wide_val;
2016
2017 wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2018 dec = (uint32_t)wide_val;
2019 assert(wide_val == (uint64_t)dec);
2020 }
2021
2022 return dec;
2023 }
2024
2025 boolean_t
ml_get_timer_pending()2026 ml_get_timer_pending()
2027 {
2028 uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2029 return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2030 }
2031
2032 __attribute__((noreturn))
2033 void
platform_syscall(arm_saved_state_t * state)2034 platform_syscall(arm_saved_state_t *state)
2035 {
2036 uint32_t code;
2037
2038 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2039
2040 code = (uint32_t)get_saved_state_reg(state, 3);
2041
2042 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2043 get_saved_state_reg(state, 0),
2044 get_saved_state_reg(state, 1),
2045 get_saved_state_reg(state, 2));
2046
2047 switch (code) {
2048 case 2:
2049 /* set cthread */
2050 platform_syscall_kprintf("set cthread self.\n");
2051 thread_set_cthread_self(get_saved_state_reg(state, 0));
2052 break;
2053 case 3:
2054 /* get cthread */
2055 platform_syscall_kprintf("get cthread self.\n");
2056 set_saved_state_reg(state, 0, thread_get_cthread_self());
2057 break;
2058 case 0: /* I-Cache flush (removed) */
2059 case 1: /* D-Cache flush (removed) */
2060 default:
2061 platform_syscall_kprintf("unknown: %d\n", code);
2062 break;
2063 }
2064
2065 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2066 get_saved_state_reg(state, 0));
2067
2068 thread_exception_return();
2069 }
2070
2071 static void
_enable_timebase_event_stream(uint32_t bit_index)2072 _enable_timebase_event_stream(uint32_t bit_index)
2073 {
2074 uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */
2075
2076 if (bit_index >= 64) {
2077 panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2078 }
2079
2080 __asm__ volatile ("mrs %0, CNTKCTL_EL1" : "=r"(cntkctl));
2081
2082 cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2083 cntkctl |= CNTKCTL_EL1_EVNTEN;
2084 cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2085
2086 /*
2087 * If the SOC supports it (and it isn't broken), enable
2088 * EL0 access to the timebase registers.
2089 */
2090 if (user_timebase_type() != USER_TIMEBASE_NONE) {
2091 cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2092 }
2093
2094 __builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
2095 }
2096
2097 /*
2098 * Turn timer on, unmask that interrupt.
2099 */
2100 static void
_enable_virtual_timer(void)2101 _enable_virtual_timer(void)
2102 {
2103 uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2104
2105 __builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
2106 /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2107 __builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2108 }
2109
2110 void
fiq_context_init(boolean_t enable_fiq __unused)2111 fiq_context_init(boolean_t enable_fiq __unused)
2112 {
2113 /* Interrupts still disabled. */
2114 assert(ml_get_interrupts_enabled() == FALSE);
2115 _enable_virtual_timer();
2116 }
2117
2118 void
wfe_timeout_init(void)2119 wfe_timeout_init(void)
2120 {
2121 _enable_timebase_event_stream(arm64_eventi);
2122 }
2123
2124 /**
2125 * Configures, but does not enable, the WFE event stream. The event stream
2126 * generates an event at a set interval to act as a timeout for WFEs.
2127 *
2128 * This function sets the static global variable arm64_eventi to be the proper
2129 * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2130 * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2131 * is used by wfe_timeout_init to actually poke the registers and enable the
2132 * event stream.
2133 *
2134 * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2135 * is the trigger for the system to generate an event. The trigger can occur on
2136 * either the rising or falling edge of the bit depending on the value of
2137 * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2138 * falling edge (1->0) transition to generate events.
2139 */
2140 void
wfe_timeout_configure(void)2141 wfe_timeout_configure(void)
2142 {
2143 /* Could fill in our own ops here, if we needed them */
2144 uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0;
2145 uint32_t bit_index;
2146
2147 if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2148 if (events_per_sec <= 0) {
2149 events_per_sec = 1;
2150 } else if (events_per_sec > USEC_PER_SEC) {
2151 events_per_sec = USEC_PER_SEC;
2152 }
2153 } else {
2154 events_per_sec = USEC_PER_SEC;
2155 }
2156 ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2157 ticks_per_event = ticks_per_sec / events_per_sec;
2158
2159 /* Bit index of next power of two greater than ticks_per_event */
2160 bit_index = flsll(ticks_per_event) - 1;
2161 /* Round up to next power of two if ticks_per_event is initially power of two */
2162 if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2163 bit_index++;
2164 }
2165
2166 /*
2167 * The timer can only trigger on rising or falling edge, not both; we don't
2168 * care which we trigger on, but we do need to adjust which bit we are
2169 * interested in to account for this.
2170 *
2171 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2172 * falling edge of the given bit. Therefore, we must decrement the bit index
2173 * by one as when the bit before the one we care about makes a 1 -> 0
2174 * transition, the bit we care about makes a 0 -> 1 transition.
2175 *
2176 * For example if we want an event generated every 8 ticks (if we calculated
2177 * a bit_index of 3), we would want the event to be generated whenever the
2178 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2179 * see that the bit at index 2 makes a falling transition in this scenario,
2180 * so we would want EVENTI to be 2 instead of 3.
2181 */
2182 if (bit_index != 0) {
2183 bit_index--;
2184 }
2185
2186 arm64_eventi = bit_index;
2187 }
2188
2189 boolean_t
ml_delay_should_spin(uint64_t interval)2190 ml_delay_should_spin(uint64_t interval)
2191 {
2192 cpu_data_t *cdp = getCpuDatap();
2193
2194 if (cdp->cpu_idle_latency) {
2195 return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2196 } else {
2197 /*
2198 * Early boot, latency is unknown. Err on the side of blocking,
2199 * which should always be safe, even if slow
2200 */
2201 return FALSE;
2202 }
2203 }
2204
2205 boolean_t
ml_thread_is64bit(thread_t thread)2206 ml_thread_is64bit(thread_t thread)
2207 {
2208 return thread_is_64bit_addr(thread);
2209 }
2210
/*
 * On DEVELOPMENT or DEBUG builds, delay for yield_delay_us when that value is
 * nonzero. Does nothing on other builds.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us == 0) {
		return;
	}
	delay(yield_delay_us);
#endif
}
2220
/* Intentionally empty: there is nothing to do for this hook here. */
void
ml_timer_evaluate(void)
{
	return;
}
2225
2226 boolean_t
ml_timer_forced_evaluation(void)2227 ml_timer_forced_evaluation(void)
2228 {
2229 return FALSE;
2230 }
2231
2232 uint64_t
ml_energy_stat(thread_t t)2233 ml_energy_stat(thread_t t)
2234 {
2235 return t->machine.energy_estimate_nj;
2236 }
2237
2238
2239 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2240 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2241 {
2242 /*
2243 * For now: update the resource coalition stats of the
2244 * current thread's coalition
2245 */
2246 task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2247 }
2248
2249 uint64_t
ml_gpu_stat(__unused thread_t t)2250 ml_gpu_stat(__unused thread_t t)
2251 {
2252 return 0;
2253 }
2254
2255 #if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT
2256
2257 static void
timer_state_event(boolean_t switch_to_kernel)2258 timer_state_event(boolean_t switch_to_kernel)
2259 {
2260 thread_t thread = current_thread();
2261 if (!thread->precise_user_kernel_time) {
2262 return;
2263 }
2264
2265 processor_t pd = current_processor();
2266 uint64_t now = ml_get_speculative_timebase();
2267
2268 timer_stop(pd->current_state, now);
2269 pd->current_state = (switch_to_kernel) ? &pd->system_state : &pd->user_state;
2270 timer_start(pd->current_state, now);
2271
2272 timer_stop(pd->thread_timer, now);
2273 pd->thread_timer = (switch_to_kernel) ? &thread->system_timer : &thread->user_timer;
2274 timer_start(pd->thread_timer, now);
2275 }
2276
2277 void
timer_state_event_user_to_kernel(void)2278 timer_state_event_user_to_kernel(void)
2279 {
2280 timer_state_event(TRUE);
2281 }
2282
2283 void
timer_state_event_kernel_to_user(void)2284 timer_state_event_kernel_to_user(void)
2285 {
2286 timer_state_event(FALSE);
2287 }
2288 #endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT */
2289
2290 thread_t
current_thread(void)2291 current_thread(void)
2292 {
2293 return current_thread_fast();
2294 }
2295
2296 typedef struct{
2297 ex_cb_t cb;
2298 void *refcon;
2299 }
2300 ex_cb_info_t;
2301
2302 ex_cb_info_t ex_cb_info[EXCB_CLASS_MAX];
2303
2304 /*
2305 * Callback registration
2306 * Currently we support only one registered callback per class but
2307 * it should be possible to support more callbacks
2308 */
2309 kern_return_t
ex_cb_register(ex_cb_class_t cb_class,ex_cb_t cb,void * refcon)2310 ex_cb_register(
2311 ex_cb_class_t cb_class,
2312 ex_cb_t cb,
2313 void *refcon)
2314 {
2315 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2316
2317 if ((NULL == cb) || (cb_class >= EXCB_CLASS_MAX)) {
2318 return KERN_INVALID_VALUE;
2319 }
2320
2321 if (NULL == pInfo->cb) {
2322 pInfo->cb = cb;
2323 pInfo->refcon = refcon;
2324 return KERN_SUCCESS;
2325 }
2326 return KERN_FAILURE;
2327 }
2328
2329 /*
2330 * Called internally by platform kernel to invoke the registered callback for class
2331 */
2332 ex_cb_action_t
ex_cb_invoke(ex_cb_class_t cb_class,vm_offset_t far)2333 ex_cb_invoke(
2334 ex_cb_class_t cb_class,
2335 vm_offset_t far)
2336 {
2337 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2338 ex_cb_state_t state = {far};
2339
2340 if (cb_class >= EXCB_CLASS_MAX) {
2341 panic("Invalid exception callback class 0x%x", cb_class);
2342 }
2343
2344 if (pInfo->cb) {
2345 return pInfo->cb(cb_class, pInfo->refcon, &state);
2346 }
2347 return EXCB_ACTION_NONE;
2348 }
2349
2350 #if defined(HAS_APPLE_PAC)
2351 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2352 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2353 {
2354 assert(task);
2355 task->disable_user_jop = disable_user_jop;
2356 }
2357
2358 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2359 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2360 {
2361 assert(thread);
2362 thread->machine.disable_user_jop = disable_user_jop;
2363 }
2364
2365 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2366 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2367 {
2368 if (inherit) {
2369 task->rop_pid = parent_task->rop_pid;
2370 } else {
2371 task->rop_pid = early_random();
2372 }
2373 }
2374
2375 /**
2376 * jop_pid may be inherited from the parent task or generated inside the shared
2377 * region. Unfortunately these two parameters are available at very different
2378 * times during task creation, so we need to split this into two steps.
2379 */
2380 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit)2381 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
2382 {
2383 if (inherit) {
2384 task->jop_pid = parent_task->jop_pid;
2385 } else {
2386 task->jop_pid = ml_default_jop_pid();
2387 }
2388 }
2389
2390 void
ml_task_set_jop_pid_from_shared_region(task_t task)2391 ml_task_set_jop_pid_from_shared_region(task_t task)
2392 {
2393 vm_shared_region_t sr = vm_shared_region_get(task);
2394 /*
2395 * If there's no shared region, we can assign the key arbitrarily. This
2396 * typically happens when Mach-O image activation failed part of the way
2397 * through, and this task is in the middle of dying with SIGKILL anyway.
2398 */
2399 if (__improbable(!sr)) {
2400 task->jop_pid = early_random();
2401 return;
2402 }
2403 vm_shared_region_deallocate(sr);
2404
2405 /*
2406 * Similarly we have to worry about jetsam having killed the task and
2407 * already cleared the shared_region_id.
2408 */
2409 task_lock(task);
2410 if (task->shared_region_id != NULL) {
2411 task->jop_pid = shared_region_find_key(task->shared_region_id);
2412 } else {
2413 task->jop_pid = early_random();
2414 }
2415 task_unlock(task);
2416 }
2417
2418 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2419 ml_thread_set_jop_pid(thread_t thread, task_t task)
2420 {
2421 thread->machine.jop_pid = task->jop_pid;
2422 }
2423 #endif /* defined(HAS_APPLE_PAC) */
2424
2425 #if defined(HAS_APPLE_PAC)
2426 #ifdef __ARM_ARCH_8_6__
2427 /**
2428 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2429 */
2430 static void *
ml_poison_ptr(void * ptr,ptrauth_key key)2431 ml_poison_ptr(void *ptr, ptrauth_key key)
2432 {
2433 bool b_key = key & (1ULL << 0);
2434 uint64_t error_code;
2435 if (b_key) {
2436 error_code = 2;
2437 } else {
2438 error_code = 1;
2439 }
2440
2441 bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2442 bool data_key = key & (1ULL << 1);
2443 /* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2444 bool tbi = data_key && !kernel_pointer;
2445 unsigned int poison_shift;
2446 if (tbi) {
2447 poison_shift = 53;
2448 } else {
2449 poison_shift = 61;
2450 }
2451
2452 uintptr_t poisoned = (uintptr_t)ptr;
2453 poisoned &= ~(3ULL << poison_shift);
2454 poisoned |= error_code << poison_shift;
2455 return (void *)poisoned;
2456 }
2457
/*
 * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
 * compiler to assume this operation has side-effects and cannot be reordered.
 *
 * __suffix is pasted onto "pac" to form the instruction mnemonic; the
 * expression evaluates to the signed pointer.
 */
#define ptrauth_sign_volatile(__value, __suffix, __data)                \
	({                                                              \
	        void *__signed = __value;                               \
	        asm volatile (                                          \
	                "pac" #__suffix " %[value], %[data]"            \
	                : [value] "+r"(__signed)                        \
	                : [data] "r"(__data)                            \
	        );                                                      \
	        __signed;                                               \
	})
2472
/*
 * Software authentication: strip _ptr, re-sign the stripped value with the
 * same key and modifier, and compare against the original. On a match _ptr
 * becomes the stripped pointer; otherwise it becomes the poisoned form of the
 * stripped pointer. _ptr must be an lvalue.
 */
#define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier)                           \
	do {                                                                                    \
	        void *stripped_ptr_ = ptrauth_strip(_ptr, _key);                                \
	        void *resigned_ptr_ = ptrauth_sign_volatile(stripped_ptr_, _suffix, _modifier); \
	        if (__probable(_ptr == resigned_ptr_)) {                                        \
	                _ptr = stripped_ptr_;                                                   \
	        } else {                                                                        \
	                _ptr = ml_poison_ptr(stripped_ptr_, _key);                              \
	        }                                                                               \
	} while (0)
2483
/* ARMv8.6 build: authenticate in software; the key constant is derived from the suffix. */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier)
2486 #else
/*
 * Non-ARMv8.6 build: authenticate in place with the "aut<suffix>" instruction.
 * Note that the expansion already ends in a semicolon.
 */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
2489 #endif /* __ARM_ARCH_8_6__ */
2490
2491 /**
2492 * Authenticates a signed pointer without trapping on failure.
2493 *
2494 * @warning This function must be called with interrupts disabled.
2495 *
2496 * @warning Pointer authentication failure should normally be treated as a fatal
2497 * error. This function is intended for a handful of callers that cannot panic
2498 * on failure, and that understand the risks in handling a poisoned return
2499 * value. Other code should generally use the trapping variant
2500 * ptrauth_auth_data() instead.
2501 *
2502 * @param ptr the pointer to authenticate
2503 * @param key which key to use for authentication
2504 * @param modifier a modifier to mix into the key
2505 * @return an authenticated version of ptr, possibly with poison bits set
2506 */
2507 void *
ml_auth_ptr_unchecked(void * ptr,ptrauth_key key,uint64_t modifier)2508 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
2509 {
2510 switch (key & 0x3) {
2511 case ptrauth_key_asia:
2512 _ml_auth_ptr_unchecked(ptr, ia, modifier);
2513 break;
2514 case ptrauth_key_asib:
2515 _ml_auth_ptr_unchecked(ptr, ib, modifier);
2516 break;
2517 case ptrauth_key_asda:
2518 _ml_auth_ptr_unchecked(ptr, da, modifier);
2519 break;
2520 case ptrauth_key_asdb:
2521 _ml_auth_ptr_unchecked(ptr, db, modifier);
2522 break;
2523 }
2524
2525 return ptr;
2526 }
2527 #endif /* defined(HAS_APPLE_PAC) */
2528
2529 #ifdef CONFIG_XNUPOST
2530 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2531 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2532 {
2533 thread_t thread = current_thread();
2534 thread->machine.expected_fault_handler = expected_fault_handler;
2535 thread->machine.expected_fault_addr = expected_fault_addr;
2536 }
2537
2538 void
ml_expect_fault_end(void)2539 ml_expect_fault_end(void)
2540 {
2541 thread_t thread = current_thread();
2542 thread->machine.expected_fault_handler = NULL;
2543 thread->machine.expected_fault_addr = 0;
2544 }
2545 #endif /* CONFIG_XNUPOST */
2546
/*
 * With HIBERNATION configured, rebuild the VM structures when the system is
 * waking from hibernation. Otherwise this does nothing.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_rebuild_vm_structs();
#endif /* HIBERNATION */
}
2557
/*
 * With HIBERNATION configured and the system waking from hibernation: run
 * hibernate_machine_init(), call hibernate_vm_lock_end(), and clear the
 * current CPU's cpu_hibernate flag. Otherwise this does nothing.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2569
2570 /**
2571 * Return back a machine-dependent array of address space regions that should be
2572 * reserved by the VM (pre-mapped in the address space). This will prevent user
2573 * processes from allocating or deallocating from within these regions.
2574 *
2575 * @param vm_is64bit True if the process has a 64-bit address space.
2576 * @param regions An out parameter representing an array of regions to reserve.
2577 *
2578 * @return The number of reserved regions returned through `regions`.
2579 */
2580 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,struct vm_reserved_region ** regions)2581 ml_get_vm_reserved_regions(bool vm_is64bit, struct vm_reserved_region **regions)
2582 {
2583 assert(regions != NULL);
2584
2585 /**
2586 * Reserved regions only apply to 64-bit address spaces. This is because
2587 * we only expect to grow the maximum user VA address on 64-bit address spaces
2588 * (we've essentially already reached the max for 32-bit spaces). The reserved
2589 * regions should safely fall outside of the max user VA for 32-bit processes.
2590 */
2591 if (vm_is64bit) {
2592 *regions = vm_reserved_regions;
2593 return ARRAY_COUNT(vm_reserved_regions);
2594 } else {
2595 /* Don't reserve any VA regions on arm64_32 processes. */
2596 *regions = NULL;
2597 return 0;
2598 }
2599 }
2600 /* These WFE recommendations are expected to be updated on a relatively
2601 * infrequent cadence, possibly from a different cluster, hence
2602 * false cacheline sharing isn't expected to be material
2603 */
2604 static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
2605
2606 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)2607 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
2608 {
2609 assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
2610 assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
2611 os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
2612 return 0; /* Success */
2613 }
2614
#if DEVELOPMENT || DEBUG
/* Debug-only overrides consulted by ml_cluster_wfe_timeout(); 0 disables each. */
int wfe_rec_max = 0;               /* raise the timeout to the largest recommendation of any cluster */
int wfe_rec_none = 0;              /* force the timeout to 0 */
uint64_t wfe_rec_override_mat = 0; /* replace the timeout with this value */
uint64_t wfe_rec_clamp = 0;        /* upper bound applied to the cluster's own recommendation */
#endif
2621
2622 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2623 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2624 {
2625 /* This and its consumer does not synchronize vis-a-vis updates
2626 * of the recommendation; races are acceptable.
2627 */
2628 uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2629 #if DEVELOPMENT || DEBUG
2630 if (wfe_rec_clamp) {
2631 wfet = MIN(wfe_rec_clamp, wfet);
2632 }
2633
2634 if (wfe_rec_max) {
2635 for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2636 if (arm64_cluster_wfe_recs[i] > wfet) {
2637 wfet = arm64_cluster_wfe_recs[i];
2638 }
2639 }
2640 }
2641
2642 if (wfe_rec_none) {
2643 wfet = 0;
2644 }
2645
2646 if (wfe_rec_override_mat) {
2647 wfet = wfe_rec_override_mat;
2648 }
2649 #endif
2650 return wfet;
2651 }
2652