1 /*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_shared_region.h>
56 #include <vm/vm_map.h>
57 #include <sys/codesign.h>
58 #include <sys/kdebug.h>
59 #include <kern/coalition.h>
60 #include <pexpert/device_tree.h>
61
62 #include <IOKit/IOPlatformExpert.h>
63 #if HIBERNATION
64 #include <IOKit/IOHibernatePrivate.h>
65 #endif /* HIBERNATION */
66
67 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
68 #include <arm64/amcc_rorgn.h>
69 #endif
70
71
72 #include <libkern/section_keywords.h>
73
74 /**
75 * On supported hardware, debuggable builds make the HID bits read-only
76 * without locking them. This lets people manually modify HID bits while
77 * debugging, since they can use a debugging tool to first reset the HID
78 * bits back to read/write. However it will still catch xnu changes that
79 * accidentally write to HID bits after they've been made read-only.
80 */
81
82 #if KPC
83 #include <kern/kpc.h>
84 #endif
85
/* Extract the Aff0 field of an MPIDR_EL1 value (used in this file as the CPU number within its cluster). */
#define MPIDR_CPU_ID(mpidr)     (((mpidr) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
/* Extract the Aff1 field of an MPIDR_EL1 value (used in this file as the physical cluster number). */
#define MPIDR_CLUSTER_ID(mpidr) (((mpidr) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
88
89 #if HAS_CLUSTER
90 static uint8_t cluster_initialized = 0;
91 #endif
92
93 MACHINE_TIMEOUT_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
94 machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout
95
96 MACHINE_TIMEOUT_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
97
98 MACHINE_TIMEOUT_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
99
100 uint64_t low_MutexSpin;
101 int64_t high_MutexSpin;
102
103
104
105 static uint64_t ml_wfe_hint_max_interval;
106 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
107
108 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
109 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
110
111 extern vm_offset_t segLOWEST;
112 extern vm_offset_t segLOWESTTEXT;
113 extern vm_offset_t segLASTB;
114 extern unsigned long segSizeLAST;
115
116 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
117 extern vm_offset_t vm_kernelcache_base;
118 extern vm_offset_t vm_kernelcache_top;
119
120 extern vm_offset_t arm_vm_kernelcache_phys_start;
121 extern vm_offset_t arm_vm_kernelcache_phys_end;
122
123 #if defined(HAS_IPI)
124 unsigned int gFastIPI = 1;
125 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
126 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
127 kDeferredIPITimerDefault);
128 #endif /* defined(HAS_IPI) */
129
130 thread_t Idle_context(void);
131
132 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
133
134 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
135 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
136 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
137 .version = CPU_TOPOLOGY_VERSION,
138 .cpus = topology_cpu_array,
139 .clusters = topology_cluster_array,
140 };
141
142 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
143
144 /**
145 * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
146 * entries of an arbitrary data type. This is intended for use by specialized consumers
147 * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
148 * as follows:
149 * hypothetical_array[cluster_offsets[AFF1] + AFF0]
150 * Most consumers should instead use general-purpose facilities such as PERCPU or
151 * ml_get_cpu_number().
152 */
153 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
154
155 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
156
157 extern uint32_t lockdown_done;
158
159 /**
160 * Represents regions of virtual address space that should be reserved
161 * (pre-mapped) in each user address space.
162 */
163 SECURITY_READ_ONLY_LATE(static struct vm_reserved_region) vm_reserved_regions[] = {
164 {
165 .vmrr_name = "GPU Carveout",
166 .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
167 .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
168 },
169 /*
170 * Reserve the virtual memory space representing the commpage nesting region
171 * to prevent user processes from allocating memory within it. The actual
172 * page table entries for the commpage are inserted by vm_commpage_enter().
173 * This vm_map_enter() just prevents userspace from allocating/deallocating
174 * anything within the entire commpage nested region.
175 */
176 {
177 .vmrr_name = "commpage nesting",
178 .vmrr_addr = _COMM_PAGE64_NESTING_START,
179 .vmrr_size = _COMM_PAGE64_NESTING_SIZE
180 }
181 };
182
183 uint32_t get_arm_cpu_version(void);
184
#if defined(HAS_IPI)
/*
 * Write an IPI request of the given type for the CPU named by cpu_mpidr.
 *
 * cpu_mpidr: MPIDR_EL1-style physical ID of the target CPU.
 * type:      request-type bits OR'd into the value written to the register.
 *
 * With HAS_CLUSTER, a target in the caller's own cluster is signalled through
 * S3_5_C15_C0_0 using only its CPU ID; any other target goes through
 * S3_5_C15_C0_1 with its cluster ID placed at bit 16. Without HAS_CLUSTER
 * the request always goes through S3_5_C15_C0_1.
 *
 * NOTE: this logic expects a non-preemptible context, or at least a thread
 * bound to a single CPU. Otherwise the thread may migrate between choosing
 * the IPI mechanism and issuing the IPI.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);

#if HAS_CLUSTER
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
	uint64_t self_mpidr;

	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		request |= MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT;
		MSR("S3_5_C15_C0_1", request);
	} else {
		MSR("S3_5_C15_C0_0", request);
	}
#else
	MSR("S3_5_C15_C0_1", request);
#endif
}
#endif
210
211 #if !defined(HAS_IPI)
212 __dead2
213 #endif
214 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)215 ml_cpu_signal(unsigned int cpu_mpidr __unused)
216 {
217 #if defined(HAS_IPI)
218 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
219 #else
220 panic("Platform does not support ACC Fast IPI");
221 #endif
222 }
223
224 #if !defined(HAS_IPI)
225 __dead2
226 #endif
227 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)228 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
229 {
230 #if defined(HAS_IPI)
231 /* adjust IPI_CR timer countdown value for deferred IPI
232 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
233 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
234 *
235 * global register, should only require a single write to update all
236 * CPU cores: from Skye ACC user spec section 5.7.3.3
237 *
238 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
239 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
240 */
241 uint64_t abstime;
242
243 nanoseconds_to_absolutetime(nanosecs, &abstime);
244
245 abstime = MIN(abstime, 0xFFFF);
246
247 /* update deferred_ipi_timer_ns with the new clamped value */
248 absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
249
250 MSR("S3_5_C15_C3_1", abstime);
251 #else
252 (void)nanosecs;
253 panic("Platform does not support ACC Fast IPI");
254 #endif
255 }
256
/*
 * Return the current deferred-IPI countdown in nanoseconds, or 0 on
 * kernels built without HAS_IPI.
 *
 * The parameter list is spelled (void) so the definition is a real
 * prototype; an empty "()" list would let callers pass arguments unchecked.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
266
267 #if !defined(HAS_IPI)
268 __dead2
269 #endif
270 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)271 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
272 {
273 #if defined(HAS_IPI)
274 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
275 #else
276 panic("Platform does not support ACC Fast IPI deferral");
277 #endif
278 }
279
280 #if !defined(HAS_IPI)
281 __dead2
282 #endif
283 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)284 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
285 {
286 #if defined(HAS_IPI)
287 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
288 #else
289 panic("Platform does not support ACC Fast IPI retraction");
290 #endif
291 }
292
293 extern uint32_t idle_proximate_io_wfe_unmasked;
294
295 #define CPUPM_IDLE_WFE 0x5310300
296 static bool
wfe_process_recommendation(void)297 wfe_process_recommendation(void)
298 {
299 bool ipending = false;
300 if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
301 /* Check for an active perf. controller generated
302 * WFE recommendation for this cluster.
303 */
304 cpu_data_t *cdp = getCpuDatap();
305 uint32_t cid = cdp->cpu_cluster_id;
306 uint64_t wfe_ttd = 0;
307 uint64_t wfe_deadline = 0;
308
309 if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
310 wfe_deadline = mach_absolute_time() + wfe_ttd;
311 }
312
313 if (wfe_deadline != 0) {
314 /* Poll issuing event-bounded WFEs until an interrupt
315 * arrives or the WFE recommendation expires
316 */
317 #if DEVELOPMENT || DEBUG
318 uint64_t wc = cdp->wfe_count;
319 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
320 #endif
321 /* Issue WFE until the recommendation expires,
322 * with IRQs unmasked.
323 */
324 ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true);
325 #if DEVELOPMENT || DEBUG
326 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
327 #endif
328 }
329 }
330 return ipending;
331 }
332
333 void
machine_idle(void)334 machine_idle(void)
335 {
336 /* Interrupts are expected to be masked on entry or re-entry via
337 * Idle_load_context()
338 */
339 assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
340 /* Check for, and act on, a WFE recommendation.
341 * Bypasses context spill/fill for a minor perf. increment.
342 * May unmask and restore IRQ+FIQ mask.
343 */
344 if (wfe_process_recommendation() == false) {
345 /* If WFE recommendation absent, or WFE deadline
346 * arrived with no interrupt pending/processed,
347 * fall back to WFI.
348 */
349 Idle_context();
350 }
351 __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
352 }
353
354 void
OSSynchronizeIO(void)355 OSSynchronizeIO(void)
356 {
357 __builtin_arm_dsb(DSB_SY);
358 }
359
/* Return the current value of ACTLR_EL1. */
uint64_t
get_aux_control(void)
{
	uint64_t actlr;

	MRS(actlr, "ACTLR_EL1");

	return actlr;
}
368
/* Return the current value of SCTLR_EL1. */
uint64_t
get_mmu_control(void)
{
	uint64_t sctlr;

	MRS(sctlr, "SCTLR_EL1");

	return sctlr;
}
377
/* Return the current value of TCR_EL1. */
uint64_t
get_tcr(void)
{
	uint64_t tcr;

	MRS(tcr, "TCR_EL1");

	return tcr;
}
386
387 boolean_t
ml_get_interrupts_enabled(void)388 ml_get_interrupts_enabled(void)
389 {
390 uint64_t value;
391
392 MRS(value, "DAIF");
393 if (value & DAIF_IRQF) {
394 return FALSE;
395 }
396 return TRUE;
397 }
398
399 pmap_paddr_t
get_mmu_ttb(void)400 get_mmu_ttb(void)
401 {
402 pmap_paddr_t value;
403
404 MRS(value, "TTBR0_EL1");
405 return value;
406 }
407
408 uint32_t
get_arm_cpu_version(void)409 get_arm_cpu_version(void)
410 {
411 uint32_t value = machine_read_midr();
412
413 /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
414 return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
415 }
416
/*
 * Test feature bits in AIDR_EL1.
 *
 * feature_bit: mask of the bit(s) to test.
 * Returns true when any bit of feature_bit is set in AIDR_EL1.
 */
bool
ml_feature_supported(uint32_t feature_bit)
{
	uint64_t aidr = 0;

	MRS(aidr, "AIDR_EL1");

	return (aidr & feature_bit) != 0;
}
427
428 /*
429 * user_cont_hwclock_allowed()
430 *
431 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
432 * as a continuous time source (e.g. from mach_continuous_time)
433 */
434 boolean_t
user_cont_hwclock_allowed(void)435 user_cont_hwclock_allowed(void)
436 {
437 #if HAS_CONTINUOUS_HWCLOCK
438 return TRUE;
439 #else
440 return FALSE;
441 #endif
442 }
443
444 /*
445 * user_timebase_type()
446 *
447 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
448 *
449 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
450 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
451 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
452 *
453 */
454
455 uint8_t
user_timebase_type(void)456 user_timebase_type(void)
457 {
458 #if HAS_ACNTVCT
459 return USER_TIMEBASE_NOSPEC_APPLE;
460 #elif __ARM_ARCH_8_6__
461 return USER_TIMEBASE_NOSPEC;
462 #else
463 return USER_TIMEBASE_SPEC;
464 #endif
465 }
466
467 void
machine_startup(__unused boot_args * args)468 machine_startup(__unused boot_args * args)
469 {
470 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
471 if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
472 gFastIPI = 1;
473 }
474 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
475
476
477 machine_conf();
478
479
480 /*
481 * Kick off the kernel bootstrap.
482 */
483 kernel_bootstrap();
484 /* NOTREACHED */
485 }
486
487 typedef void (*invalidate_fn_t)(void);
488
489 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
490
491 void set_invalidate_hmac_function(invalidate_fn_t fn);
492
493 void
set_invalidate_hmac_function(invalidate_fn_t fn)494 set_invalidate_hmac_function(invalidate_fn_t fn)
495 {
496 if (NULL != invalidate_hmac_function) {
497 panic("Invalidate HMAC function already set");
498 }
499
500 invalidate_hmac_function = fn;
501 }
502
503 void
machine_lockdown(void)504 machine_lockdown(void)
505 {
506 arm_vm_prot_finalize(PE_state.bootArgs);
507
508 #if CONFIG_KERNEL_INTEGRITY
509 #if KERNEL_INTEGRITY_WT
510 /* Watchtower
511 *
512 * Notify the monitor about the completion of early kernel bootstrap.
513 * From this point forward it will enforce the integrity of kernel text,
514 * rodata and page tables.
515 */
516
517 #ifdef MONITOR
518 monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
519 #endif
520 #endif /* KERNEL_INTEGRITY_WT */
521
522 #if XNU_MONITOR
523 pmap_lockdown_ppl();
524 #endif
525
526 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
527 /* KTRR
528 *
529 * Lock physical KTRR region. KTRR region is read-only. Memory outside
530 * the region is not executable at EL1.
531 */
532
533 rorgn_lockdown();
534 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
535
536 #endif /* CONFIG_KERNEL_INTEGRITY */
537
538
539 if (NULL != invalidate_hmac_function) {
540 invalidate_hmac_function();
541 }
542
543 lockdown_done = 1;
544 }
545
546
547 char *
machine_boot_info(__unused char * buf,__unused vm_size_t size)548 machine_boot_info(
549 __unused char *buf,
550 __unused vm_size_t size)
551 {
552 return PE_boot_args();
553 }
554
555 void
slave_machine_init(__unused void * param)556 slave_machine_init(__unused void *param)
557 {
558 cpu_machine_init(); /* Initialize the processor */
559 clock_init(); /* Init the clock */
560 }
561
562 /*
563 * Routine: machine_processor_shutdown
564 * Function:
565 */
566 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)567 machine_processor_shutdown(
568 __unused thread_t thread,
569 void (*doshutdown)(processor_t),
570 processor_t processor)
571 {
572 return Shutdown_context(doshutdown, processor);
573 }
574
575 /*
576 * Routine: ml_init_lock_timeout
577 * Function:
578 */
579 void
ml_init_lock_timeout(void)580 ml_init_lock_timeout(void)
581 {
582 /*
583 * This function is called after STARUP_SUB_TIMEOUTS
584 * initialization, so using the "legacy" boot-args here overrides
585 * the ml-timeout-... configuration. (Given that these boot-args
586 * here are usually explicitly specified, this makes sense by
587 * overriding ml-timeout-..., which may come from the device tree.
588 */
589
590 uint64_t lto_timeout_ns;
591 uint64_t lto_abstime;
592 uint32_t slto;
593
594 if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
595 lto_timeout_ns = slto * NSEC_PER_USEC;
596 nanoseconds_to_absolutetime(lto_timeout_ns, <o_abstime);
597 os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
598 } else {
599 lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
600 absolutetime_to_nanoseconds(lto_abstime, <o_timeout_ns);
601 }
602
603 os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
604
605 if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
606 nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, <o_abstime);
607 os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
608 } else if (lto_abstime != 0) {
609 os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
610 } // else take default from MACHINE_TIMEOUT.
611
612 uint64_t mtxspin;
613 uint64_t mtx_abstime;
614 if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
615 if (mtxspin > USEC_PER_SEC >> 4) {
616 mtxspin = USEC_PER_SEC >> 4;
617 }
618 nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
619 os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
620 } else {
621 mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
622 }
623
624 low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
625 /*
626 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
627 * real_ncpus is not set at this time
628 *
629 * NOTE: active spinning is disabled in arm. It can be activated
630 * by setting high_MutexSpin through the sysctl.
631 */
632 high_MutexSpin = low_MutexSpin;
633
634 uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
635 PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
636 nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
637
638 #if CONFIG_PV_TICKET
639 kprintf("pv locks %sabled\n", has_lock_pv? "en" : "dis");
640 #endif
641 }
642
643 /*
644 * This is called when all of the ml_processor_info_t structures have been
645 * initialized and all the processors have been started through processor_start().
646 *
647 * Required by the scheduler subsystem.
648 */
649 void
ml_cpu_init_completed(void)650 ml_cpu_init_completed(void)
651 {
652 if (SCHED(cpu_init_completed) != NULL) {
653 SCHED(cpu_init_completed)();
654 }
655 }
656
657 /*
658 * These are called from the machine-independent routine cpu_up()
659 * to perform machine-dependent info updates.
660 *
661 * The update to CPU counts needs to be separate from other actions
662 * because we don't update the counts when CLPC causes temporary
663 * cluster powerdown events, as these must be transparent to the user.
664 */
/* Machine-dependent CPU bring-up hook; intentionally does nothing here. */
void
ml_cpu_up(void)
{
	/* Nothing to do. */
}
669
670 void
ml_cpu_up_update_counts(int cpu_id)671 ml_cpu_up_update_counts(int cpu_id)
672 {
673 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
674
675 os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
676
677 os_atomic_inc(&machine_info.physical_cpu, relaxed);
678 os_atomic_inc(&machine_info.logical_cpu, relaxed);
679 }
680
681 /*
682 * These are called from the machine-independent routine cpu_down()
683 * to perform machine-dependent info updates.
684 *
685 * The update to CPU counts needs to be separate from other actions
686 * because we don't update the counts when CLPC causes temporary
687 * cluster powerdown events, as these must be transparent to the user.
688 */
689 void
ml_cpu_down(void)690 ml_cpu_down(void)
691 {
692 /*
693 * If we want to deal with outstanding IPIs, we need to
694 * do relatively early in the processor_doshutdown path,
695 * as we pend decrementer interrupts using the IPI
696 * mechanism if we cannot immediately service them (if
697 * IRQ is masked). Do so now.
698 *
699 * We aren't on the interrupt stack here; would it make
700 * more sense to disable signaling and then enable
701 * interrupts? It might be a bit cleaner.
702 */
703 cpu_data_t *cpu_data_ptr = getCpuDatap();
704 cpu_data_ptr->cpu_running = FALSE;
705
706 if (cpu_data_ptr != &BootCpuData) {
707 /*
708 * Move all of this cpu's timers to the master/boot cpu,
709 * and poke it in case there's a sooner deadline for it to schedule.
710 */
711 timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
712 kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
713 if (rv != KERN_SUCCESS) {
714 panic("ml_cpu_down: IPI failure %d", rv);
715 }
716 }
717
718 cpu_signal_handler_internal(TRUE);
719 }
720 void
ml_cpu_down_update_counts(int cpu_id)721 ml_cpu_down_update_counts(int cpu_id)
722 {
723 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
724
725 os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
726
727 os_atomic_dec(&machine_info.physical_cpu, relaxed);
728 os_atomic_dec(&machine_info.logical_cpu, relaxed);
729 }
730
731
732 unsigned int
ml_get_machine_mem(void)733 ml_get_machine_mem(void)
734 {
735 return machine_info.memory_size;
736 }
737
738 __attribute__((noreturn))
739 void
halt_all_cpus(boolean_t reboot)740 halt_all_cpus(boolean_t reboot)
741 {
742 if (reboot) {
743 printf("MACH Reboot\n");
744 PEHaltRestart(kPERestartCPU);
745 } else {
746 printf("CPU halted\n");
747 PEHaltRestart(kPEHaltCPU);
748 }
749 while (1) {
750 ;
751 }
752 }
753
754 __attribute__((noreturn))
755 void
halt_cpu(void)756 halt_cpu(void)
757 {
758 halt_all_cpus(FALSE);
759 }
760
761 /*
762 * Routine: machine_signal_idle
763 * Function:
764 */
765 void
machine_signal_idle(processor_t processor)766 machine_signal_idle(
767 processor_t processor)
768 {
769 cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
770 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
771 }
772
773 void
machine_signal_idle_deferred(processor_t processor)774 machine_signal_idle_deferred(
775 processor_t processor)
776 {
777 cpu_signal_deferred(processor_to_cpu_datap(processor));
778 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
779 }
780
781 void
machine_signal_idle_cancel(processor_t processor)782 machine_signal_idle_cancel(
783 processor_t processor)
784 {
785 cpu_signal_cancel(processor_to_cpu_datap(processor));
786 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
787 }
788
789 /*
790 * Routine: ml_install_interrupt_handler
791 * Function: Initialize Interrupt Handler
792 */
793 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)794 ml_install_interrupt_handler(
795 void *nub,
796 int source,
797 void *target,
798 IOInterruptHandler handler,
799 void *refCon)
800 {
801 cpu_data_t *cpu_data_ptr;
802 boolean_t current_state;
803
804 current_state = ml_set_interrupts_enabled(FALSE);
805 cpu_data_ptr = getCpuDatap();
806
807 cpu_data_ptr->interrupt_nub = nub;
808 cpu_data_ptr->interrupt_source = source;
809 cpu_data_ptr->interrupt_target = target;
810 cpu_data_ptr->interrupt_handler = handler;
811 cpu_data_ptr->interrupt_refCon = refCon;
812
813 (void) ml_set_interrupts_enabled(current_state);
814 }
815
/*
 *	Routine:        ml_init_interrupt
 *	Function:       Initialize Interrupts. With HAS_IPI, programs the
 *	                deferred-IPI timer; otherwise does nothing.
 */
void
ml_init_interrupt(void)
{
#ifdef HAS_IPI
	/*
	 * ml_init_interrupt is called once per CPU, but the register has
	 * only one global copy on skye, so repeating the write would be
	 * redundant. Do it only from a CPU whose cluster_master flag is set.
	 */
	if (!getCpuDatap()->cluster_master) {
		return;
	}
	ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
#endif
}
834
835 /*
836 * Routine: ml_init_timebase
837 * Function: register and setup Timebase, Decremeter services
838 */
839 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)840 ml_init_timebase(
841 void *args,
842 tbd_ops_t tbd_funcs,
843 vm_offset_t int_address,
844 vm_offset_t int_value __unused)
845 {
846 cpu_data_t *cpu_data_ptr;
847
848 cpu_data_ptr = (cpu_data_t *)args;
849
850 if ((cpu_data_ptr == &BootCpuData)
851 && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
852 rtclock_timebase_func = *tbd_funcs;
853 rtclock_timebase_addr = int_address;
854 }
855 }
856
857 #define ML_READPROP_MANDATORY UINT64_MAX
858
859 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)860 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
861 {
862 void const *prop;
863 unsigned int propSize;
864
865 if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
866 if (propSize == sizeof(uint8_t)) {
867 return *((uint8_t const *)prop);
868 } else if (propSize == sizeof(uint16_t)) {
869 return *((uint16_t const *)prop);
870 } else if (propSize == sizeof(uint32_t)) {
871 return *((uint32_t const *)prop);
872 } else if (propSize == sizeof(uint64_t)) {
873 return *((uint64_t const *)prop);
874 } else {
875 panic("CPU property '%s' has bad size %u", propertyName, propSize);
876 }
877 } else {
878 if (default_value == ML_READPROP_MANDATORY) {
879 panic("Missing mandatory property '%s'", propertyName);
880 }
881 return default_value;
882 }
883 }
884
885 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)886 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
887 {
888 uint64_t const *prop;
889 unsigned int propSize;
890
891 if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
892 return FALSE;
893 }
894
895 if (propSize != sizeof(uint64_t) * 2) {
896 panic("Wrong property size for %s", propertyName);
897 }
898
899 *pa_ptr = prop[0];
900 *len_ptr = prop[1];
901 return TRUE;
902 }
903
904 static boolean_t
ml_is_boot_cpu(const DTEntry entry)905 ml_is_boot_cpu(const DTEntry entry)
906 {
907 void const *prop;
908 unsigned int propSize;
909
910 if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
911 panic("unable to retrieve state for cpu");
912 }
913
914 if (strncmp((char const *)prop, "running", propSize) == 0) {
915 return TRUE;
916 } else {
917 return FALSE;
918 }
919 }
920
921 static void
ml_read_chip_revision(unsigned int * rev __unused)922 ml_read_chip_revision(unsigned int *rev __unused)
923 {
924 // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
925 #ifdef APPLE_ARM64_ARCH_FAMILY
926 DTEntry entryP;
927
928 if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
929 *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
930 } else {
931 *rev = CPU_VERSION_UNKNOWN;
932 }
933 #endif
934 }
935
936 void
ml_parse_cpu_topology(void)937 ml_parse_cpu_topology(void)
938 {
939 DTEntry entry, child __unused;
940 OpaqueDTEntryIterator iter;
941 uint32_t cpu_boot_arg = MAX_CPUS;
942 uint64_t cpumask_boot_arg = ULLONG_MAX;
943 int err;
944
945 int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
946 int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
947 const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
948 const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
949
950 // The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
951 // so that we trigger a panic later in the boot process, once serial is enabled.
952 if (cpus_boot_arg_present && cpumask_boot_arg_present) {
953 cpu_config_correct = false;
954 }
955
956 err = SecureDTLookupEntry(NULL, "/cpus", &entry);
957 assert(err == kSuccess);
958
959 err = SecureDTInitEntryIterator(entry, &iter);
960 assert(err == kSuccess);
961
962 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
963 cluster_offsets[i] = -1;
964 cluster_phys_to_logical[i] = -1;
965 cluster_max_cpu_phys_id[i] = 0;
966 }
967
968 while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
969 boolean_t is_boot_cpu = ml_is_boot_cpu(child);
970 boolean_t cpu_enabled = cpumask_boot_arg & 1;
971 cpumask_boot_arg >>= 1;
972
973 // Boot CPU disabled in cpumask. Flag this so that we trigger a panic
974 // later in the boot process, once serial is enabled.
975 if (is_boot_cpu && !cpu_enabled) {
976 cpu_config_correct = false;
977 }
978
979 // Ignore this CPU if it has been disabled by the cpumask= boot-arg.
980 if (!is_boot_cpu && !cpu_enabled) {
981 continue;
982 }
983
984 // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
985 // been added to the topology struct yet, and we only have one slot left, then skip
986 // every other non-boot CPU in order to leave room for the boot CPU.
987 //
988 // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
989 // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted.
990 if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
991 continue;
992 }
993 if (topology_info.num_cpus >= cpu_boot_arg) {
994 break;
995 }
996
997 ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
998
999 cpu->cpu_id = topology_info.num_cpus++;
1000 assert(cpu->cpu_id < MAX_CPUS);
1001 topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
1002
1003 cpu->reserved = 0;
1004 topology_info.reserved = 0;
1005
1006 cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
1007
1008 cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
1009 cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
1010 cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
1011 cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
1012 cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
1013
1014 ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
1015 ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
1016 ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1017 cpu->cluster_type = CLUSTER_TYPE_SMP;
1018
1019 int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1020 if (cluster_type == 'E') {
1021 cpu->cluster_type = CLUSTER_TYPE_E;
1022 } else if (cluster_type == 'P') {
1023 cpu->cluster_type = CLUSTER_TYPE_P;
1024 }
1025
1026 topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1027
1028 /*
1029 * Since we want to keep a linear cluster ID space, we cannot just rely
1030 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1031 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1032 */
1033 #if HAS_CLUSTER
1034 uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1035 #else
1036 uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1037 #endif
1038 assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1039 cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1040 topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1041
1042 assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1043
1044 ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1045 if (cluster->num_cpus == 0) {
1046 assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1047
1048 topology_info.num_clusters++;
1049 topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1050 topology_info.cluster_types |= (1 << cpu->cluster_type);
1051
1052 cluster->cluster_id = cpu->cluster_id;
1053 cluster->cluster_type = cpu->cluster_type;
1054 cluster->first_cpu_id = cpu->cpu_id;
1055 assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1056 cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1057
1058 topology_info.cluster_type_num_clusters[cluster->cluster_type]++;
1059
1060 // Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1061 // If we wind up with a bunch of these, we might want to create separate per-cluster
1062 // EDT nodes and have the CPU nodes reference them through a phandle.
1063 ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1064 ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1065 }
1066
1067 #if HAS_CLUSTER
1068 if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1069 cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1070 }
1071 #endif
1072
1073 cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1074 cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1075
1076 cluster->num_cpus++;
1077 cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1078
1079 if (is_boot_cpu) {
1080 assert(topology_info.boot_cpu == NULL);
1081 topology_info.boot_cpu = cpu;
1082 topology_info.boot_cluster = cluster;
1083 }
1084 }
1085
1086 #if HAS_CLUSTER
1087 /*
1088 * Build the cluster offset array, ensuring that the region reserved
1089 * for each physical cluster contains enough entries to be indexed
1090 * by the maximum physical CPU ID (AFF0) within the cluster.
1091 */
1092 unsigned int cur_cluster_offset = 0;
1093 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1094 if (cluster_phys_to_logical[i] != -1) {
1095 cluster_offsets[i] = cur_cluster_offset;
1096 cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1097 }
1098 }
1099 assert(cur_cluster_offset <= MAX_CPUS);
1100 #else
1101 /*
1102 * For H10, there are really 2 physical clusters, but they are not separated
1103 * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering
1104 * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing,
1105 * treat H10 and earlier devices as though they contain a single cluster.
1106 */
1107 cluster_offsets[0] = 0;
1108 #endif
1109 assert(topology_info.boot_cpu != NULL);
1110 ml_read_chip_revision(&topology_info.chip_revision);
1111
1112 /*
1113 * Set TPIDR_EL0 to indicate the correct cpu number, as we may
1114 * not be booting from cpu 0. Userspace will consume the current
1115 * CPU number through this register. For non-boot cores, this is
1116 * done in start.s (start_cpu) using the cpu_number field of the
1117 * per-cpu data object.
1118 */
1119 uint64_t cpuid = topology_info.boot_cpu->cpu_id;
1120
1121 __builtin_arm_wsr64("TPIDR_EL0", cpuid & MACHDEP_TPIDR_CPUNUM_MASK);
1122 assert((cpuid & MACHDEP_TPIDR_CPUNUM_MASK) == cpuid);
1123 __builtin_arm_wsr64("TPIDRRO_EL0", 0);
1124 }
1125
1126 const ml_topology_info_t *
ml_get_topology_info(void)1127 ml_get_topology_info(void)
1128 {
1129 return &topology_info;
1130 }
1131
1132 void
ml_map_cpu_pio(void)1133 ml_map_cpu_pio(void)
1134 {
1135 unsigned int i;
1136
1137 for (i = 0; i < topology_info.num_cpus; i++) {
1138 ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1139 if (cpu->cpu_IMPL_pa) {
1140 cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1141 cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1142 }
1143 if (cpu->cpu_UTTDBG_pa) {
1144 cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1145 }
1146 }
1147
1148 for (i = 0; i < topology_info.num_clusters; i++) {
1149 ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1150 if (cluster->acc_IMPL_pa) {
1151 cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1152 }
1153 if (cluster->cpm_IMPL_pa) {
1154 cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1155 }
1156 }
1157 }
1158
1159 unsigned int
ml_get_cpu_count(void)1160 ml_get_cpu_count(void)
1161 {
1162 return topology_info.num_cpus;
1163 }
1164
1165 unsigned int
ml_get_cluster_count(void)1166 ml_get_cluster_count(void)
1167 {
1168 return topology_info.num_clusters;
1169 }
1170
1171 int
ml_get_boot_cpu_number(void)1172 ml_get_boot_cpu_number(void)
1173 {
1174 return topology_info.boot_cpu->cpu_id;
1175 }
1176
1177 cluster_type_t
ml_get_boot_cluster_type(void)1178 ml_get_boot_cluster_type(void)
1179 {
1180 return topology_info.boot_cluster->cluster_type;
1181 }
1182
1183 int
ml_get_cpu_number(uint32_t phys_id)1184 ml_get_cpu_number(uint32_t phys_id)
1185 {
1186 phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1187
1188 for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1189 if (topology_info.cpus[i].phys_id == phys_id) {
1190 return i;
1191 }
1192 }
1193
1194 return -1;
1195 }
1196
1197 int
ml_get_cluster_number(uint32_t phys_id)1198 ml_get_cluster_number(uint32_t phys_id)
1199 {
1200 int cpu_id = ml_get_cpu_number(phys_id);
1201 if (cpu_id < 0) {
1202 return -1;
1203 }
1204
1205 ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1206
1207 return cpu->cluster_id;
1208 }
1209
1210 unsigned int
ml_get_cpu_number_local(void)1211 ml_get_cpu_number_local(void)
1212 {
1213 uint64_t mpidr_el1_value = 0;
1214 unsigned cpu_id;
1215
1216 /* We identify the CPU based on the constant bits of MPIDR_EL1. */
1217 MRS(mpidr_el1_value, "MPIDR_EL1");
1218 cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
1219
1220 assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
1221
1222 return cpu_id;
1223 }
1224
1225 int
ml_get_cluster_number_local()1226 ml_get_cluster_number_local()
1227 {
1228 uint64_t mpidr_el1_value = 0;
1229 unsigned cluster_id;
1230
1231 /* We identify the cluster based on the constant bits of MPIDR_EL1. */
1232 MRS(mpidr_el1_value, "MPIDR_EL1");
1233 cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
1234
1235 assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
1236
1237 return cluster_id;
1238 }
1239
1240 int
ml_get_max_cpu_number(void)1241 ml_get_max_cpu_number(void)
1242 {
1243 return topology_info.max_cpu_id;
1244 }
1245
1246 int
ml_get_max_cluster_number(void)1247 ml_get_max_cluster_number(void)
1248 {
1249 return topology_info.max_cluster_id;
1250 }
1251
1252 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1253 ml_get_first_cpu_id(unsigned int cluster_id)
1254 {
1255 return topology_info.clusters[cluster_id].first_cpu_id;
1256 }
1257
1258
/*
 * Routine:	ml_lockdown_init
 * Function:	On configurations built with KERNEL_INTEGRITY_KTRR or
 *		KERNEL_INTEGRITY_CTRR, call rorgn_stash_range(). On all other
 *		configurations this is a no-op.
 *
 *		The parameter list is spelled (void) so that the definition is a
 *		full prototype rather than an old-style unprototyped one.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif
}
1266
1267 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1268 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1269 {
1270 if (!f) {
1271 return KERN_FAILURE;
1272 }
1273
1274 assert(lockdown_done);
1275 f(this); // XXX: f this whole function
1276
1277 return KERN_SUCCESS;
1278 }
1279
1280
1281
1282 extern lck_mtx_t pset_create_lock;
1283
1284 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1285 ml_processor_register(ml_processor_info_t *in_processor_info,
1286 processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1287 perfmon_interrupt_handler_func *pmi_handler_out)
1288 {
1289 cpu_data_t *this_cpu_datap;
1290 processor_set_t pset;
1291 boolean_t is_boot_cpu;
1292 static unsigned int reg_cpu_count = 0;
1293
1294 if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1295 return KERN_FAILURE;
1296 }
1297
1298 if ((unsigned)OSIncrementAtomic((SInt32*)®_cpu_count) >= topology_info.num_cpus) {
1299 return KERN_FAILURE;
1300 }
1301
1302 if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1303 is_boot_cpu = FALSE;
1304 this_cpu_datap = cpu_data_alloc(FALSE);
1305 cpu_data_init(this_cpu_datap);
1306 } else {
1307 this_cpu_datap = &BootCpuData;
1308 is_boot_cpu = TRUE;
1309 }
1310
1311 assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1312
1313 this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1314
1315 if (!is_boot_cpu) {
1316 this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1317
1318 if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1319 goto processor_register_error;
1320 }
1321 assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1322 }
1323
1324 this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1325 this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1326 nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1327 this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1328
1329 this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1330 this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1331
1332 this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1333 this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1334 this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1335 this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1336
1337 this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1338 this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1339 this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1340 this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1341 this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1342 this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1343
1344 #if HAS_CLUSTER
1345 this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1346 #else /* HAS_CLUSTER */
1347 this_cpu_datap->cluster_master = is_boot_cpu;
1348 #endif /* HAS_CLUSTER */
1349 lck_mtx_lock(&pset_create_lock);
1350 pset = pset_find(in_processor_info->cluster_id, NULL);
1351 kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1352 if (pset == NULL) {
1353 #if __AMP__
1354 pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1355 pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1356 assert(pset != PROCESSOR_SET_NULL);
1357 kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1358 #else /* __AMP__ */
1359 pset_cluster_type_t pset_cluster_type = PSET_SMP;
1360 pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1361 assert(pset != PROCESSOR_SET_NULL);
1362 #endif /* __AMP__ */
1363 }
1364 kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1365 lck_mtx_unlock(&pset_create_lock);
1366
1367 processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1368 if (!is_boot_cpu) {
1369 processor_init(processor, this_cpu_datap->cpu_number, pset);
1370
1371 if (this_cpu_datap->cpu_l2_access_penalty) {
1372 /*
1373 * Cores that have a non-zero L2 access penalty compared
1374 * to the boot processor should be de-prioritized by the
1375 * scheduler, so that threads use the cores with better L2
1376 * preferentially.
1377 */
1378 processor_set_primary(processor, master_processor);
1379 }
1380 }
1381
1382 *processor_out = processor;
1383 *ipi_handler_out = cpu_signal_handler;
1384 #if CPMU_AIC_PMI && MONOTONIC
1385 *pmi_handler_out = mt_cpmu_aic_pmi;
1386 #else
1387 *pmi_handler_out = NULL;
1388 #endif /* CPMU_AIC_PMI && MONOTONIC */
1389 if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1390 *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1391 }
1392
1393 #if KPC
1394 if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1395 goto processor_register_error;
1396 }
1397 #endif /* KPC */
1398
1399 if (!is_boot_cpu) {
1400 random_cpu_init(this_cpu_datap->cpu_number);
1401 // now let next CPU register itself
1402 OSIncrementAtomic((SInt32*)&real_ncpus);
1403 }
1404
1405 return KERN_SUCCESS;
1406
1407 processor_register_error:
1408 #if KPC
1409 kpc_unregister_cpu(this_cpu_datap);
1410 #endif /* KPC */
1411 if (!is_boot_cpu) {
1412 cpu_data_free(this_cpu_datap);
1413 }
1414
1415 return KERN_FAILURE;
1416 }
1417
1418 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1419 ml_init_arm_debug_interface(
1420 void * in_cpu_datap,
1421 vm_offset_t virt_address)
1422 {
1423 ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1424 do_debugid();
1425 }
1426
1427 /*
1428 * Routine: init_ast_check
1429 * Function:
1430 */
1431 void
init_ast_check(__unused processor_t processor)1432 init_ast_check(
1433 __unused processor_t processor)
1434 {
1435 }
1436
1437 /*
1438 * Routine: cause_ast_check
1439 * Function:
1440 */
1441 void
cause_ast_check(processor_t processor)1442 cause_ast_check(
1443 processor_t processor)
1444 {
1445 if (current_processor() != processor) {
1446 cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1447 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1448 }
1449 }
1450
1451 extern uint32_t cpu_idle_count;
1452
1453 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1454 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1455 {
1456 *icp = ml_at_interrupt_context();
1457 *pidlep = (cpu_idle_count == real_ncpus);
1458 }
1459
/*
 *	Routine:        ml_cause_interrupt
 *	Function:       Generate a fake interrupt. The body is currently a stub
 *	                that does nothing (marked BS_XXX in the original).
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX: not implemented. */
}
1469
1470 /* Map memory map IO space */
1471 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1472 ml_io_map(
1473 vm_offset_t phys_addr,
1474 vm_size_t size)
1475 {
1476 return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1477 }
1478
1479 /* Map memory map IO space (with protections specified) */
1480 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1481 ml_io_map_with_prot(
1482 vm_offset_t phys_addr,
1483 vm_size_t size,
1484 vm_prot_t prot)
1485 {
1486 return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1487 }
1488
1489 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1490 ml_io_map_unmappable(
1491 vm_offset_t phys_addr,
1492 vm_size_t size,
1493 unsigned int flags)
1494 {
1495 return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1496 }
1497
1498 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1499 ml_io_map_wcomb(
1500 vm_offset_t phys_addr,
1501 vm_size_t size)
1502 {
1503 return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1504 }
1505
1506 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1507 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1508 {
1509 pmap_remove(kernel_pmap, addr, addr + sz);
1510 kmem_free(kernel_map, addr, sz);
1511 }
1512
1513 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1514 ml_map_high_window(
1515 vm_offset_t phys_addr,
1516 vm_size_t len)
1517 {
1518 return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1519 }
1520
1521 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1522 ml_static_ptovirt(
1523 vm_offset_t paddr)
1524 {
1525 return phystokv(paddr);
1526 }
1527
1528 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1529 ml_static_slide(
1530 vm_offset_t vaddr)
1531 {
1532 vm_offset_t slid_vaddr = vaddr + vm_kernel_slide;
1533
1534 if ((slid_vaddr < vm_kernelcache_base) || (slid_vaddr >= vm_kernelcache_top)) {
1535 /* This is only intended for use on kernelcache addresses. */
1536 return 0;
1537 }
1538
1539 /*
1540 * Because the address is in the kernelcache, we can do a simple
1541 * slide calculation.
1542 */
1543 return slid_vaddr;
1544 }
1545
1546 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1547 ml_static_unslide(
1548 vm_offset_t vaddr)
1549 {
1550 if ((vaddr < vm_kernelcache_base) || (vaddr >= vm_kernelcache_top)) {
1551 /* This is only intended for use on kernelcache addresses. */
1552 return 0;
1553 }
1554
1555 return vaddr - vm_kernel_slide;
1556 }
1557
1558 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1559
/*
 * Routine:	ml_static_protect
 * Function:	Change the protections of the kernel mappings covering
 *		[vaddr, trunc_page_64(vaddr + size)) to new_prot by rewriting
 *		the access-permission and no-execute bits of each page table
 *		entry. Pages for which pmap_find_phys() reports no physical page
 *		are skipped. The TLB is flushed for the part of the range that
 *		was walked.
 *
 *		Panics if vaddr is below VM_MIN_KERNEL_ADDRESS, if new_prot
 *		requests both write and execute, or if execute is requested once
 *		lockdown_done is set.
 *
 * Returns:	KERN_SUCCESS when every page was handled. KERN_FAILURE when a
 *		page is covered by a non-table translation entry (unless it is a
 *		block entry that already has the requested protections), or by a
 *		PTE with the contiguous hint whose protections differ from the
 *		requested ones. Pages updated before the failing page keep their
 *		new protections.
 */
kern_return_t
ml_static_protect(
	vm_offset_t vaddr, /* kernel virtual address */
	vm_size_t size,
	vm_prot_t new_prot)
{
	pt_entry_t    arm_prot = 0;
	pt_entry_t    arm_block_prot = 0;
	vm_offset_t   vaddr_cur;
	ppnum_t       ppn;
	kern_return_t result = KERN_SUCCESS;

	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) VM_MIN_KERNEL_ADDRESS);
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	/* NX is set unconditionally; PNX is added only when execute is not requested. */
	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	/* The end of the range is truncated to a page boundary, so a trailing partial page is not visited. */
	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t      *tte2;
			pt_entry_t      *pte_p;
			pt_entry_t      ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections.  We still want to run checks on a per-page basis.
					 */
					continue;
				}

				/* Any other non-table entry cannot be updated here. */
				result = KERN_FAILURE;
				break;
			}

			/* Locate the PTE for vaddr_cur inside the table that tte2 points to. */
			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				{
					unsigned int    i;
					pt_entry_t      *ptep_iter;

					/* Update the four consecutive PTEs starting at pte_p. */
					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	/* Flush the TLB for whatever part of the range the loop advanced over, even after a failure. */
	if (vaddr_cur > vaddr) {
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}


	return result;
}
1684
1685 /*
1686 * Routine: ml_static_mfree
1687 * Function:
1688 */
1689 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)1690 ml_static_mfree(
1691 vm_offset_t vaddr,
1692 vm_size_t size)
1693 {
1694 vm_offset_t vaddr_cur;
1695 vm_offset_t paddr_cur;
1696 ppnum_t ppn;
1697 uint32_t freed_pages = 0;
1698 uint32_t freed_kernelcache_pages = 0;
1699
1700
1701 /* It is acceptable (if bad) to fail to free. */
1702 if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1703 return;
1704 }
1705
1706 assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1707
1708 for (vaddr_cur = vaddr;
1709 vaddr_cur < trunc_page_64(vaddr + size);
1710 vaddr_cur += PAGE_SIZE) {
1711 ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1712 if (ppn != (vm_offset_t) NULL) {
1713 /*
1714 * It is not acceptable to fail to update the protections on a page
1715 * we will release to the VM. We need to either panic or continue.
1716 * For now, we'll panic (to help flag if there is memory we can
1717 * reclaim).
1718 */
1719 if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
1720 panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1721 }
1722
1723 paddr_cur = ptoa(ppn);
1724
1725
1726 vm_page_create(ppn, (ppn + 1));
1727 freed_pages++;
1728 if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end) {
1729 freed_kernelcache_pages++;
1730 }
1731 }
1732 }
1733 vm_page_lockspin_queues();
1734 vm_page_wire_count -= freed_pages;
1735 vm_page_wire_count_initial -= freed_pages;
1736 vm_page_kernelcache_count -= freed_kernelcache_pages;
1737 vm_page_unlock_queues();
1738 #if DEBUG
1739 kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
1740 #endif
1741 }
1742
1743
1744 /* virtual to physical on wired pages */
1745 vm_offset_t
ml_vtophys(vm_offset_t vaddr)1746 ml_vtophys(vm_offset_t vaddr)
1747 {
1748 return kvtophys(vaddr);
1749 }
1750
1751 /*
1752 * Routine: ml_nofault_copy
1753 * Function: Perform a physical mode copy if the source and destination have
1754 * valid translations in the kernel pmap. If translations are present, they are
1755 * assumed to be wired; e.g., no attempt is made to guarantee that the
1756 * translations obtained remain valid for the duration of the copy process.
1757 */
1758 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)1759 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
1760 {
1761 addr64_t cur_phys_dst, cur_phys_src;
1762 vm_size_t count, nbytes = 0;
1763
1764 while (size > 0) {
1765 if (!(cur_phys_src = kvtophys(virtsrc))) {
1766 break;
1767 }
1768 if (!(cur_phys_dst = kvtophys(virtdst))) {
1769 break;
1770 }
1771 if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
1772 !pmap_valid_address(trunc_page_64(cur_phys_src))) {
1773 break;
1774 }
1775 count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
1776 if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
1777 count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
1778 }
1779 if (count > size) {
1780 count = size;
1781 }
1782
1783 bcopy_phys(cur_phys_src, cur_phys_dst, count);
1784
1785 nbytes += count;
1786 virtsrc += count;
1787 virtdst += count;
1788 size -= count;
1789 }
1790
1791 return nbytes;
1792 }
1793
1794 /*
1795 * Routine: ml_validate_nofault
1796 * Function: Validate that ths address range has a valid translations
1797 * in the kernel pmap. If translations are present, they are
1798 * assumed to be wired; i.e. no attempt is made to guarantee
1799 * that the translation persist after the check.
1800 * Returns: TRUE if the range is mapped and will not cause a fault,
1801 * FALSE otherwise.
1802 */
1803
1804 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)1805 ml_validate_nofault(
1806 vm_offset_t virtsrc, vm_size_t size)
1807 {
1808 addr64_t cur_phys_src;
1809 uint32_t count;
1810
1811 while (size > 0) {
1812 if (!(cur_phys_src = kvtophys(virtsrc))) {
1813 return FALSE;
1814 }
1815 if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
1816 return FALSE;
1817 }
1818 count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
1819 if (count > size) {
1820 count = (uint32_t)size;
1821 }
1822
1823 virtsrc += count;
1824 size -= count;
1825 }
1826
1827 return TRUE;
1828 }
1829
1830 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)1831 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
1832 {
1833 *phys_addr = 0;
1834 *size = 0;
1835 }
1836
1837 void
active_rt_threads(__unused boolean_t active)1838 active_rt_threads(__unused boolean_t active)
1839 {
1840 }
1841
1842 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)1843 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
1844 {
1845 return;
1846 }
1847
1848 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
1849
1850 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)1851 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
1852 {
1853 if (cpu_qos_cb != NULL) {
1854 cpu_qos_update = cpu_qos_cb;
1855 } else {
1856 cpu_qos_update = cpu_qos_cb_default;
1857 }
1858 }
1859
1860 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)1861 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
1862 {
1863 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
1864
1865 cpu_qos_update((int)urgency, rt_period, rt_deadline);
1866
1867 SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
1868 }
1869
1870 void
machine_run_count(__unused uint32_t count)1871 machine_run_count(__unused uint32_t count)
1872 {
1873 }
1874
1875 processor_t
machine_choose_processor(__unused processor_set_t pset,processor_t processor)1876 machine_choose_processor(__unused processor_set_t pset, processor_t processor)
1877 {
1878 return processor;
1879 }
1880
1881 #if KASAN
1882 vm_offset_t ml_stack_base(void);
1883 vm_size_t ml_stack_size(void);
1884
1885 vm_offset_t
ml_stack_base(void)1886 ml_stack_base(void)
1887 {
1888 uintptr_t local = (uintptr_t) &local;
1889 vm_offset_t intstack_top_ptr;
1890
1891 intstack_top_ptr = getCpuDatap()->intstack_top;
1892 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1893 return intstack_top_ptr - INTSTACK_SIZE;
1894 } else {
1895 return current_thread()->kernel_stack;
1896 }
1897 }
1898 vm_size_t
ml_stack_size(void)1899 ml_stack_size(void)
1900 {
1901 uintptr_t local = (uintptr_t) &local;
1902 vm_offset_t intstack_top_ptr;
1903
1904 intstack_top_ptr = getCpuDatap()->intstack_top;
1905 if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1906 return INTSTACK_SIZE;
1907 } else {
1908 return kernel_stack_size;
1909 }
1910 }
1911 #endif
1912
1913 #ifdef CONFIG_KCOV
1914
1915 kcov_cpu_data_t *
current_kcov_data(void)1916 current_kcov_data(void)
1917 {
1918 return ¤t_cpu_datap()->cpu_kcov_data;
1919 }
1920
1921 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)1922 cpu_kcov_data(int cpuid)
1923 {
1924 return &cpu_datap(cpuid)->cpu_kcov_data;
1925 }
1926
1927 #endif /* CONFIG_KCOV */
1928
1929 boolean_t
machine_timeout_suspended(void)1930 machine_timeout_suspended(void)
1931 {
1932 return FALSE;
1933 }
1934
1935 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)1936 ml_interrupt_prewarm(__unused uint64_t deadline)
1937 {
1938 return KERN_FAILURE;
1939 }
1940
1941 /*
1942 * Assumes fiq, irq disabled.
1943 */
1944 void
ml_set_decrementer(uint32_t dec_value)1945 ml_set_decrementer(uint32_t dec_value)
1946 {
1947 cpu_data_t *cdp = getCpuDatap();
1948
1949 assert(ml_get_interrupts_enabled() == FALSE);
1950 cdp->cpu_decrementer = dec_value;
1951
1952 if (cdp->cpu_set_decrementer_func) {
1953 cdp->cpu_set_decrementer_func(dec_value);
1954 } else {
1955 __builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
1956 }
1957 }
1958
1959 /**
1960 * Reads from a non-speculative view of the timebase. If no such view exists on
1961 * this CPU, then an ISB is used to prevent speculation instead.
1962 *
1963 * @return the current value of the hardware timebase
1964 */
1965 static inline uint64_t
nonspeculative_timebase(void)1966 nonspeculative_timebase(void)
1967 {
1968 #if defined(HAS_ACNTVCT)
1969 return __builtin_arm_rsr64("ACNTVCT_EL0");
1970 #elif __ARM_ARCH_8_6__
1971 return __builtin_arm_rsr64("CNTVCTSS_EL0");
1972 #else
1973 // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
1974 // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
1975 // to other instructions executed on the same processor."
1976 __builtin_arm_isb(ISB_SY);
1977 return __builtin_arm_rsr64("CNTVCT_EL0");
1978 #endif
1979 }
1980
1981
/**
 * Returns the raw hardware clock as read by nonspeculative_timebase().
 *
 * The parameter list is spelled (void) so that this definition is a real
 * prototype; an empty "()" list leaves the arguments unchecked.
 *
 * @return the current non-speculative hardware timebase value
 */
uint64_t
ml_get_hwclock(void)
{
	uint64_t timebase = nonspeculative_timebase();
	return timebase;
}
1988
1989 uint64_t
ml_get_timebase()1990 ml_get_timebase()
1991 {
1992 uint64_t clock, timebase;
1993
1994 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
1995 do {
1996 timebase = getCpuDatap()->cpu_base_timebase;
1997 os_compiler_barrier();
1998 clock = ml_get_hwclock();
1999 os_compiler_barrier();
2000 } while (getCpuDatap()->cpu_base_timebase != timebase);
2001
2002 return clock + timebase;
2003 }
2004
2005 /*
2006 * Get the speculative timebase without an ISB.
2007 */
2008 uint64_t
ml_get_speculative_timebase(void)2009 ml_get_speculative_timebase(void)
2010 {
2011 uint64_t clock, timebase;
2012
2013 //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
2014 do {
2015 timebase = getCpuDatap()->cpu_base_timebase;
2016 os_compiler_barrier();
2017 clock = __builtin_arm_rsr64("CNTVCT_EL0");
2018
2019 os_compiler_barrier();
2020 } while (getCpuDatap()->cpu_base_timebase != timebase);
2021
2022 return clock + timebase;
2023 }
2024
/*
 * Timebase sample for entropy consumers; simply forwards to
 * ml_get_speculative_timebase().
 */
uint64_t
ml_get_timebase_entropy(void)
{
	uint64_t sample = ml_get_speculative_timebase();

	return sample;
}
2030
2031 uint32_t
ml_get_decrementer(void)2032 ml_get_decrementer(void)
2033 {
2034 cpu_data_t *cdp = getCpuDatap();
2035 uint32_t dec;
2036
2037 assert(ml_get_interrupts_enabled() == FALSE);
2038
2039 if (cdp->cpu_get_decrementer_func) {
2040 dec = cdp->cpu_get_decrementer_func();
2041 } else {
2042 uint64_t wide_val;
2043
2044 wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2045 dec = (uint32_t)wide_val;
2046 assert(wide_val == (uint64_t)dec);
2047 }
2048
2049 return dec;
2050 }
2051
2052 boolean_t
ml_get_timer_pending(void)2053 ml_get_timer_pending(void)
2054 {
2055 uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2056 return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2057 }
2058
2059 __attribute__((noreturn))
2060 void
platform_syscall(arm_saved_state_t * state)2061 platform_syscall(arm_saved_state_t *state)
2062 {
2063 uint32_t code;
2064
2065 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2066
2067 code = (uint32_t)get_saved_state_reg(state, 3);
2068
2069 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2070 get_saved_state_reg(state, 0),
2071 get_saved_state_reg(state, 1),
2072 get_saved_state_reg(state, 2));
2073
2074 switch (code) {
2075 case 2:
2076 /* set cthread */
2077 platform_syscall_kprintf("set cthread self.\n");
2078 thread_set_cthread_self(get_saved_state_reg(state, 0));
2079 break;
2080 case 3:
2081 /* get cthread */
2082 platform_syscall_kprintf("get cthread self.\n");
2083 set_saved_state_reg(state, 0, thread_get_cthread_self());
2084 break;
2085 case 0: /* I-Cache flush (removed) */
2086 case 1: /* D-Cache flush (removed) */
2087 default:
2088 platform_syscall_kprintf("unknown: %d\n", code);
2089 break;
2090 }
2091
2092 KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2093 get_saved_state_reg(state, 0));
2094
2095 thread_exception_return();
2096 }
2097
2098 static void
_enable_timebase_event_stream(uint32_t bit_index)2099 _enable_timebase_event_stream(uint32_t bit_index)
2100 {
2101 uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */
2102
2103 if (bit_index >= 64) {
2104 panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2105 }
2106
2107 __asm__ volatile ("mrs %0, CNTKCTL_EL1" : "=r"(cntkctl));
2108
2109 cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2110 cntkctl |= CNTKCTL_EL1_EVNTEN;
2111 cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2112
2113 /*
2114 * If the SOC supports it (and it isn't broken), enable
2115 * EL0 access to the timebase registers.
2116 */
2117 if (user_timebase_type() != USER_TIMEBASE_NONE) {
2118 cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2119 }
2120
2121 __builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
2122 }
2123
2124 /*
2125 * Turn timer on, unmask that interrupt.
2126 */
2127 static void
_enable_virtual_timer(void)2128 _enable_virtual_timer(void)
2129 {
2130 uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2131
2132 __builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
2133 /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2134 __builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2135 }
2136
2137 void
fiq_context_init(boolean_t enable_fiq __unused)2138 fiq_context_init(boolean_t enable_fiq __unused)
2139 {
2140 /* Interrupts still disabled. */
2141 assert(ml_get_interrupts_enabled() == FALSE);
2142 _enable_virtual_timer();
2143 }
2144
2145 void
wfe_timeout_init(void)2146 wfe_timeout_init(void)
2147 {
2148 _enable_timebase_event_stream(arm64_eventi);
2149 }
2150
2151 /**
2152 * Configures, but does not enable, the WFE event stream. The event stream
2153 * generates an event at a set interval to act as a timeout for WFEs.
2154 *
2155 * This function sets the static global variable arm64_eventi to be the proper
2156 * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2157 * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2158 * is used by wfe_timeout_init to actually poke the registers and enable the
2159 * event stream.
2160 *
2161 * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2162 * is the trigger for the system to generate an event. The trigger can occur on
2163 * either the rising or falling edge of the bit depending on the value of
2164 * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2165 * falling edge (1->0) transition to generate events.
2166 */
2167 void
wfe_timeout_configure(void)2168 wfe_timeout_configure(void)
2169 {
2170 /* Could fill in our own ops here, if we needed them */
2171 uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0;
2172 uint32_t bit_index;
2173
2174 if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2175 if (events_per_sec <= 0) {
2176 events_per_sec = 1;
2177 } else if (events_per_sec > USEC_PER_SEC) {
2178 events_per_sec = USEC_PER_SEC;
2179 }
2180 } else {
2181 events_per_sec = USEC_PER_SEC;
2182 }
2183 ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2184 ticks_per_event = ticks_per_sec / events_per_sec;
2185
2186 /* Bit index of next power of two greater than ticks_per_event */
2187 bit_index = flsll(ticks_per_event) - 1;
2188 /* Round up to next power of two if ticks_per_event is initially power of two */
2189 if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2190 bit_index++;
2191 }
2192
2193 /*
2194 * The timer can only trigger on rising or falling edge, not both; we don't
2195 * care which we trigger on, but we do need to adjust which bit we are
2196 * interested in to account for this.
2197 *
2198 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2199 * falling edge of the given bit. Therefore, we must decrement the bit index
2200 * by one as when the bit before the one we care about makes a 1 -> 0
2201 * transition, the bit we care about makes a 0 -> 1 transition.
2202 *
2203 * For example if we want an event generated every 8 ticks (if we calculated
2204 * a bit_index of 3), we would want the event to be generated whenever the
2205 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2206 * see that the bit at index 2 makes a falling transition in this scenario,
2207 * so we would want EVENTI to be 2 instead of 3.
2208 */
2209 if (bit_index != 0) {
2210 bit_index--;
2211 }
2212
2213 arm64_eventi = bit_index;
2214 }
2215
2216 boolean_t
ml_delay_should_spin(uint64_t interval)2217 ml_delay_should_spin(uint64_t interval)
2218 {
2219 cpu_data_t *cdp = getCpuDatap();
2220
2221 if (cdp->cpu_idle_latency) {
2222 return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2223 } else {
2224 /*
2225 * Early boot, latency is unknown. Err on the side of blocking,
2226 * which should always be safe, even if slow
2227 */
2228 return FALSE;
2229 }
2230 }
2231
2232 boolean_t
ml_thread_is64bit(thread_t thread)2233 ml_thread_is64bit(thread_t thread)
2234 {
2235 return thread_is_64bit_addr(thread);
2236 }
2237
/*
 * On DEVELOPMENT and DEBUG builds, delays for yield_delay_us when that value
 * is non-zero. Does nothing on other builds.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (!yield_delay_us) {
		return;
	}
	delay(yield_delay_us);
#endif
}
2247
/*
 * Timer evaluation hook. Nothing is done here.
 */
void
ml_timer_evaluate(void)
{
	return;
}
2252
2253 boolean_t
ml_timer_forced_evaluation(void)2254 ml_timer_forced_evaluation(void)
2255 {
2256 return FALSE;
2257 }
2258
2259 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2260 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2261 {
2262 /*
2263 * For now: update the resource coalition stats of the
2264 * current thread's coalition
2265 */
2266 task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2267 }
2268
2269 uint64_t
ml_gpu_stat(__unused thread_t t)2270 ml_gpu_stat(__unused thread_t t)
2271 {
2272 return 0;
2273 }
2274
2275 thread_t
current_thread(void)2276 current_thread(void)
2277 {
2278 return current_thread_fast();
2279 }
2280
2281 typedef struct{
2282 ex_cb_t cb;
2283 void *refcon;
2284 }
2285 ex_cb_info_t;
2286
2287 ex_cb_info_t ex_cb_info[EXCB_CLASS_MAX];
2288
2289 /*
2290 * Callback registration
2291 * Currently we support only one registered callback per class but
2292 * it should be possible to support more callbacks
2293 */
2294 kern_return_t
ex_cb_register(ex_cb_class_t cb_class,ex_cb_t cb,void * refcon)2295 ex_cb_register(
2296 ex_cb_class_t cb_class,
2297 ex_cb_t cb,
2298 void *refcon)
2299 {
2300 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2301
2302 if ((NULL == cb) || (cb_class >= EXCB_CLASS_MAX)) {
2303 return KERN_INVALID_VALUE;
2304 }
2305
2306 if (NULL == pInfo->cb) {
2307 pInfo->cb = cb;
2308 pInfo->refcon = refcon;
2309 return KERN_SUCCESS;
2310 }
2311 return KERN_FAILURE;
2312 }
2313
2314 /*
2315 * Called internally by platform kernel to invoke the registered callback for class
2316 */
2317 ex_cb_action_t
ex_cb_invoke(ex_cb_class_t cb_class,vm_offset_t far)2318 ex_cb_invoke(
2319 ex_cb_class_t cb_class,
2320 vm_offset_t far)
2321 {
2322 ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2323 ex_cb_state_t state = {far};
2324
2325 if (cb_class >= EXCB_CLASS_MAX) {
2326 panic("Invalid exception callback class 0x%x", cb_class);
2327 }
2328
2329 if (pInfo->cb) {
2330 return pInfo->cb(cb_class, pInfo->refcon, &state);
2331 }
2332 return EXCB_ACTION_NONE;
2333 }
2334
2335 #if defined(HAS_APPLE_PAC)
2336 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2337 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2338 {
2339 assert(task);
2340 task->disable_user_jop = disable_user_jop;
2341 }
2342
2343 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2344 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2345 {
2346 assert(thread);
2347 if (disable_user_jop) {
2348 thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2349 } else {
2350 thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2351 }
2352 }
2353
2354 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2355 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2356 {
2357 if (inherit) {
2358 task->rop_pid = parent_task->rop_pid;
2359 } else {
2360 task->rop_pid = early_random();
2361 }
2362 }
2363
2364 /**
2365 * jop_pid may be inherited from the parent task or generated inside the shared
2366 * region. Unfortunately these two parameters are available at very different
2367 * times during task creation, so we need to split this into two steps.
2368 */
2369 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit)2370 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
2371 {
2372 if (inherit) {
2373 task->jop_pid = parent_task->jop_pid;
2374 } else {
2375 task->jop_pid = ml_default_jop_pid();
2376 }
2377 }
2378
2379 void
ml_task_set_jop_pid_from_shared_region(task_t task)2380 ml_task_set_jop_pid_from_shared_region(task_t task)
2381 {
2382 vm_shared_region_t sr = vm_shared_region_get(task);
2383 /*
2384 * If there's no shared region, we can assign the key arbitrarily. This
2385 * typically happens when Mach-O image activation failed part of the way
2386 * through, and this task is in the middle of dying with SIGKILL anyway.
2387 */
2388 if (__improbable(!sr)) {
2389 task->jop_pid = early_random();
2390 return;
2391 }
2392 vm_shared_region_deallocate(sr);
2393
2394 /*
2395 * Similarly we have to worry about jetsam having killed the task and
2396 * already cleared the shared_region_id.
2397 */
2398 task_lock(task);
2399 if (task->shared_region_id != NULL) {
2400 task->jop_pid = shared_region_find_key(task->shared_region_id);
2401 } else {
2402 task->jop_pid = early_random();
2403 }
2404 task_unlock(task);
2405 }
2406
2407 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2408 ml_thread_set_jop_pid(thread_t thread, task_t task)
2409 {
2410 thread->machine.jop_pid = task->jop_pid;
2411 }
2412 #endif /* defined(HAS_APPLE_PAC) */
2413
2414 #if defined(HAS_APPLE_PAC)
2415 #if __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM
/**
 * The ARMv8.6 implementation is also safe for non-FPAC CPUs, but less efficient;
 * guest kernels need to use it because they do not know at compile time whether
 * the host CPU supports FPAC.
 */
2421
2422 /**
2423 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2424 */
2425 static void *
ml_poison_ptr(void * ptr,ptrauth_key key)2426 ml_poison_ptr(void *ptr, ptrauth_key key)
2427 {
2428 bool b_key = key & (1ULL << 0);
2429 uint64_t error_code;
2430 if (b_key) {
2431 error_code = 2;
2432 } else {
2433 error_code = 1;
2434 }
2435
2436 bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2437 bool data_key = key & (1ULL << 1);
2438 /* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2439 bool tbi = data_key && !kernel_pointer;
2440 unsigned int poison_shift;
2441 if (tbi) {
2442 poison_shift = 53;
2443 } else {
2444 poison_shift = 61;
2445 }
2446
2447 uintptr_t poisoned = (uintptr_t)ptr;
2448 poisoned &= ~(3ULL << poison_shift);
2449 poisoned |= error_code << poison_shift;
2450 return (void *)poisoned;
2451 }
2452
/*
 * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
 * compiler to assume this operation has side-effects and cannot be reordered.
 *
 * Expands to a statement expression that signs __value with the "pac<suffix>"
 * instruction, using __data as the modifier, and yields the signed pointer.
 */
#define ptrauth_sign_volatile(__value, __suffix, __data)                \
	({                                                              \
	        void *__signed = __value;                               \
	        asm volatile (                                          \
	                "pac" #__suffix " %[value], %[data]"            \
	                : [value] "+r"(__signed)                        \
	                : [data] "r"(__data)                            \
	        );                                                      \
	        __signed;                                               \
	})
2467
/*
 * Authenticates _ptr in place without trapping: the pointer is stripped,
 * re-signed with the same modifier, and compared against the original. On a
 * match _ptr becomes the stripped pointer; otherwise it becomes the poisoned
 * form produced by ml_poison_ptr(). _ptr is evaluated more than once.
 */
#define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier)                           \
	do {                                                                                    \
	        void *_stripped_ptr = ptrauth_strip(_ptr, _key);                                \
	        void *_resigned_ptr = ptrauth_sign_volatile(_stripped_ptr, _suffix, _modifier); \
	        if (__improbable(_ptr != _resigned_ptr)) {                                      \
	                _ptr = ml_poison_ptr(_stripped_ptr, _key);                              \
	        } else {                                                                        \
	                _ptr = _stripped_ptr;                                                   \
	        }                                                                               \
	} while (0)
2478
2479 #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
2480 ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier)
2481 #else
2482 #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
2483 asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
2484 #endif /* __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM */
2485
2486 /**
2487 * Authenticates a signed pointer without trapping on failure.
2488 *
2489 * @warning This function must be called with interrupts disabled.
2490 *
2491 * @warning Pointer authentication failure should normally be treated as a fatal
2492 * error. This function is intended for a handful of callers that cannot panic
2493 * on failure, and that understand the risks in handling a poisoned return
2494 * value. Other code should generally use the trapping variant
2495 * ptrauth_auth_data() instead.
2496 *
2497 * @param ptr the pointer to authenticate
2498 * @param key which key to use for authentication
2499 * @param modifier a modifier to mix into the key
2500 * @return an authenticated version of ptr, possibly with poison bits set
2501 */
2502 void *
ml_auth_ptr_unchecked(void * ptr,ptrauth_key key,uint64_t modifier)2503 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
2504 {
2505 switch (key & 0x3) {
2506 case ptrauth_key_asia:
2507 _ml_auth_ptr_unchecked(ptr, ia, modifier);
2508 break;
2509 case ptrauth_key_asib:
2510 _ml_auth_ptr_unchecked(ptr, ib, modifier);
2511 break;
2512 case ptrauth_key_asda:
2513 _ml_auth_ptr_unchecked(ptr, da, modifier);
2514 break;
2515 case ptrauth_key_asdb:
2516 _ml_auth_ptr_unchecked(ptr, db, modifier);
2517 break;
2518 }
2519
2520 return ptr;
2521 }
2522 #endif /* defined(HAS_APPLE_PAC) */
2523
2524 #ifdef CONFIG_XNUPOST
2525 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2526 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2527 {
2528 thread_t thread = current_thread();
2529 thread->machine.expected_fault_handler = expected_fault_handler;
2530 thread->machine.expected_fault_addr = expected_fault_addr;
2531 }
2532
2533 void
ml_expect_fault_end(void)2534 ml_expect_fault_end(void)
2535 {
2536 thread_t thread = current_thread();
2537 thread->machine.expected_fault_handler = NULL;
2538 thread->machine.expected_fault_addr = 0;
2539 }
2540 #endif /* CONFIG_XNUPOST */
2541
/*
 * With HIBERNATION configured, rebuilds the VM structures when the system is
 * waking from hibernate. Does nothing otherwise.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_rebuild_vm_structs();
#endif /* HIBERNATION */
}
2552
/*
 * With HIBERNATION configured, finishes a wake from hibernate: runs
 * hibernate_machine_init(), ends the hibernate VM lock, and clears the current
 * CPU's cpu_hibernate flag. Does nothing otherwise.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2564
2565 /**
2566 * Return back a machine-dependent array of address space regions that should be
2567 * reserved by the VM (pre-mapped in the address space). This will prevent user
2568 * processes from allocating or deallocating from within these regions.
2569 *
2570 * @param vm_is64bit True if the process has a 64-bit address space.
2571 * @param regions An out parameter representing an array of regions to reserve.
2572 *
2573 * @return The number of reserved regions returned through `regions`.
2574 */
2575 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,struct vm_reserved_region ** regions)2576 ml_get_vm_reserved_regions(bool vm_is64bit, struct vm_reserved_region **regions)
2577 {
2578 assert(regions != NULL);
2579
2580 /**
2581 * Reserved regions only apply to 64-bit address spaces. This is because
2582 * we only expect to grow the maximum user VA address on 64-bit address spaces
2583 * (we've essentially already reached the max for 32-bit spaces). The reserved
2584 * regions should safely fall outside of the max user VA for 32-bit processes.
2585 */
2586 if (vm_is64bit) {
2587 *regions = vm_reserved_regions;
2588 return ARRAY_COUNT(vm_reserved_regions);
2589 } else {
2590 /* Don't reserve any VA regions on arm64_32 processes. */
2591 *regions = NULL;
2592 return 0;
2593 }
2594 }
2595 /* These WFE recommendations are expected to be updated on a relatively
2596 * infrequent cadence, possibly from a different cluster, hence
2597 * false cacheline sharing isn't expected to be material
2598 */
2599 static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
2600
2601 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)2602 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
2603 {
2604 assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
2605 assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
2606 os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
2607 return 0; /* Success */
2608 }
2609
#if DEVELOPMENT || DEBUG
/*
 * Debug overrides consulted by ml_cluster_wfe_timeout(). Each one is inactive
 * while zero.
 */
/* Non-zero: use the largest recommendation across all clusters */
int wfe_rec_max = 0;
/* Non-zero: report a timeout of 0 */
int wfe_rec_none = 0;
/* Non-zero: report this value instead of the recommendation */
uint64_t wfe_rec_override_mat = 0;
/* Non-zero: upper bound applied to the cluster's own recommendation */
uint64_t wfe_rec_clamp = 0;
#endif
2616
2617 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2618 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2619 {
2620 /* This and its consumer does not synchronize vis-a-vis updates
2621 * of the recommendation; races are acceptable.
2622 */
2623 uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2624 #if DEVELOPMENT || DEBUG
2625 if (wfe_rec_clamp) {
2626 wfet = MIN(wfe_rec_clamp, wfet);
2627 }
2628
2629 if (wfe_rec_max) {
2630 for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2631 if (arm64_cluster_wfe_recs[i] > wfet) {
2632 wfet = arm64_cluster_wfe_recs[i];
2633 }
2634 }
2635 }
2636
2637 if (wfe_rec_none) {
2638 wfet = 0;
2639 }
2640
2641 if (wfe_rec_override_mat) {
2642 wfet = wfe_rec_override_mat;
2643 }
2644 #endif
2645 return wfet;
2646 }
2647