xref: /xnu-12377.1.9/osfmk/arm64/machine_routines.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_page_internal.h>
56 #include <vm/vm_pageout_xnu.h>
57 #include <vm/vm_shared_region_xnu.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern_xnu.h>
60 #include <sys/codesign.h>
61 #include <sys/kdebug.h>
62 #include <kern/coalition.h>
63 #include <pexpert/device_tree.h>
64 #include <pexpert/arm64/board_config.h>
65 
66 #include <IOKit/IOPlatformExpert.h>
67 #if HIBERNATION
68 #include <IOKit/IOHibernatePrivate.h>
69 #endif /* HIBERNATION */
70 
71 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
72 #include <arm64/amcc_rorgn.h>
73 #endif
74 
75 
76 #if CONFIG_SPTM
77 #include <arm64/sptm/sptm.h>
78 #endif /* CONFIG_SPTM */
79 
80 #include <libkern/OSAtomic.h>
81 #include <libkern/section_keywords.h>
82 
83 /**
84  * On supported hardware, debuggable builds make the HID bits read-only
85  * without locking them.  This lets people manually modify HID bits while
86  * debugging, since they can use a debugging tool to first reset the HID
87  * bits back to read/write.  However it will still catch xnu changes that
88  * accidentally write to HID bits after they've been made read-only.
89  */
90 SECURITY_READ_ONLY_LATE(bool) skip_spr_lockdown_glb = 0;
91 
92 /*
93  * On some SoCs, PIO lockdown is applied in assembly in early boot by
94  * secondary CPUs.
95  * Since the cluster_pio_ro_ctl value is dynamic, it is stored here by the
96  * primary CPU so that it doesn't have to be computed each time by the
97  * startup code.
98  */
99 SECURITY_READ_ONLY_LATE(uint64_t) cluster_pio_ro_ctl_mask_glb = 0;
100 
101 #if CONFIG_CPU_COUNTERS
102 #include <kern/kpc.h>
103 #endif /* CONFIG_CPU_COUNTERS */
104 
105 #define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
106 #define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
107 
108 #if HAS_CLUSTER
109 static uint8_t cluster_initialized = 0;
110 #endif
111 
112 MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
113 machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout
114 
115 MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
116 
117 TUNABLE_DEV_WRITEABLE(uint64_t, MutexSpin, "mutex-spin", 240 /* 10us */);
118 
119 uint64_t low_MutexSpin;
120 int64_t high_MutexSpin;
121 
122 
123 
124 static uint64_t ml_wfe_hint_max_interval;
125 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
126 
127 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
128 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
129 
130 extern vm_offset_t   segLOWEST;
131 extern vm_offset_t   segLOWESTTEXT;
132 extern vm_offset_t   segLASTB;
133 extern unsigned long segSizeLAST;
134 
135 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
136 extern vm_offset_t   vm_kernelcache_base;
137 extern vm_offset_t   vm_kernelcache_top;
138 
139 /* Location of the physmap / physical aperture */
140 extern uint64_t physmap_base;
141 
142 #if defined(CONFIG_SPTM)
143 extern const arm_physrange_t *arm_vm_kernelcache_ranges;
144 extern int arm_vm_kernelcache_numranges;
145 #else /* defined(CONFIG_SPTM) */
146 extern vm_offset_t arm_vm_kernelcache_phys_start;
147 extern vm_offset_t arm_vm_kernelcache_phys_end;
148 #endif /* defined(CONFIG_SPTM) */
149 
150 #if defined(HAS_IPI)
151 unsigned int gFastIPI = 1;
152 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
153 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
154     kDeferredIPITimerDefault);
155 #endif /* defined(HAS_IPI) */
156 
157 thread_t Idle_context(void);
158 
159 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
160 
161 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
162 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
163 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
164 	.version = CPU_TOPOLOGY_VERSION,
165 	.cpus = topology_cpu_array,
166 	.clusters = topology_cluster_array,
167 };
168 
169 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
170 
171 /**
172  * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
173  * entries of an arbitrary data type.  This is intended for use by specialized consumers
174  * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
175  * as follows:
176  *	hypothetical_array[cluster_offsets[AFF1] + AFF0]
177  * Most consumers should instead use general-purpose facilities such as PERCPU or
178  * ml_get_cpu_number().
179  */
180 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
181 
182 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
183 
184 extern uint32_t lockdown_done;
185 
186 /**
187  * Represents regions of virtual address space that should be reserved
188  * (pre-mapped) in each user address space.
189  */
190 static const struct vm_reserved_region vm_reserved_regions[] = {
191 	{
192 		.vmrr_name = "GPU Carveout",
193 		.vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
194 		.vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
195 	},
196 	/*
197 	 * Reserve the virtual memory space representing the commpage nesting region
198 	 * to prevent user processes from allocating memory within it. The actual
199 	 * page table entries for the commpage are inserted by vm_commpage_enter().
200 	 * This vm_map_enter() just prevents userspace from allocating/deallocating
201 	 * anything within the entire commpage nested region.
202 	 */
203 	{
204 		.vmrr_name = "commpage nesting",
205 		.vmrr_addr = _COMM_PAGE64_NESTING_START,
206 		.vmrr_size = _COMM_PAGE64_NESTING_SIZE
207 	}
208 };
209 
210 uint32_t get_arm_cpu_version(void);
211 
212 
213 #if defined(HAS_IPI)
214 static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr,uint32_t type)215 ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
216 {
217 #if HAS_CLUSTER
218 	uint64_t local_mpidr;
219 	/* NOTE: this logic expects that we are called in a non-preemptible
220 	 * context, or at least one in which the calling thread is bound
221 	 * to a single CPU.  Otherwise we may migrate between choosing which
222 	 * IPI mechanism to use and issuing the IPI. */
223 	MRS(local_mpidr, "MPIDR_EL1");
224 	if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
225 		uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
226 		MSR("S3_5_C15_C0_0", x);
227 	} else {
228 		#define IPI_RR_TARGET_CLUSTER_SHIFT 16
229 		uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
230 		MSR("S3_5_C15_C0_1", x);
231 	}
232 #else
233 	uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
234 	MSR("S3_5_C15_C0_1", x);
235 #endif
236 	/* The recommended local/global IPI sequence is:
237 	 *   DSB <sys> (This ensures visibility of e.g. older stores to the
238 	 *     pending CPU signals bit vector in DRAM prior to IPI reception,
239 	 *     and is present in cpu_signal_internal())
240 	 *   MSR S3_5_C15_C0_1, Xt
241 	 *   ISB
242 	 */
243 	__builtin_arm_isb(ISB_SY);
244 }
245 #endif
246 
247 #if !defined(HAS_IPI)
248 __dead2
249 #endif
250 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)251 ml_cpu_signal(unsigned int cpu_mpidr __unused)
252 {
253 #if defined(HAS_IPI)
254 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
255 #else
256 	panic("Platform does not support ACC Fast IPI");
257 #endif
258 }
259 
260 #if !defined(HAS_IPI)
261 __dead2
262 #endif
263 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)264 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
265 {
266 #if defined(HAS_IPI)
267 	/* adjust IPI_CR timer countdown value for deferred IPI
268 	 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
269 	 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
270 	 *
271 	 * global register, should only require a single write to update all
272 	 * CPU cores: from Skye ACC user spec section 5.7.3.3
273 	 *
274 	 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
275 	 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
276 	 */
277 	uint64_t abstime;
278 
279 	nanoseconds_to_absolutetime(nanosecs, &abstime);
280 
281 	abstime = MIN(abstime, 0xFFFF);
282 
283 	/* update deferred_ipi_timer_ns with the new clamped value */
284 	absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
285 
286 	MSR("S3_5_C15_C3_1", abstime);
287 #else
288 	(void)nanosecs;
289 	panic("Platform does not support ACC Fast IPI");
290 #endif
291 }
292 
/*
 * Return the current deferred-IPI timer value in nanoseconds, or 0 on
 * platforms built without HAS_IPI.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype; an empty () leaves the parameters unspecified before C23.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
302 
303 #if !defined(HAS_IPI)
304 __dead2
305 #endif
306 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)307 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
308 {
309 #if defined(HAS_IPI)
310 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
311 #else
312 	panic("Platform does not support ACC Fast IPI deferral");
313 #endif
314 }
315 
316 #if !defined(HAS_IPI)
317 __dead2
318 #endif
319 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)320 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
321 {
322 #if defined(HAS_IPI)
323 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
324 #else
325 	panic("Platform does not support ACC Fast IPI retraction");
326 #endif
327 }
328 
329 extern uint32_t idle_proximate_io_wfe_unmasked;
330 
331 #define CPUPM_IDLE_WFE 0x5310300
332 static bool
wfe_process_recommendation(void)333 wfe_process_recommendation(void)
334 {
335 	bool ipending = false;
336 	if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
337 		/* Check for an active perf. controller generated
338 		 * WFE recommendation for this cluster.
339 		 */
340 		cpu_data_t *cdp = getCpuDatap();
341 		uint32_t cid = cdp->cpu_cluster_id;
342 		uint64_t wfe_ttd = 0;
343 		uint64_t wfe_deadline = 0;
344 
345 		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
346 			wfe_deadline = mach_absolute_time() + wfe_ttd;
347 		}
348 
349 		if (wfe_deadline != 0) {
350 			/* Poll issuing event-bounded WFEs until an interrupt
351 			 * arrives or the WFE recommendation expires
352 			 */
353 #if DEVELOPMENT || DEBUG
354 			uint64_t wc = cdp->wfe_count;
355 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
356 #endif
357 			/* Issue WFE until the recommendation expires,
358 			 * with IRQs unmasked.
359 			 */
360 			ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true, true);
361 #if DEVELOPMENT || DEBUG
362 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
363 #endif
364 		}
365 	}
366 	return ipending;
367 }
368 
369 void
machine_idle(void)370 machine_idle(void)
371 {
372 	/* Interrupts are expected to be masked on entry or re-entry via
373 	 * Idle_load_context()
374 	 */
375 	assert((__builtin_arm_rsr("DAIF") & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE);
376 	/* Check for, and act on, a WFE recommendation.
377 	 * Bypasses context spill/fill for a minor perf. increment.
378 	 * May unmask and restore IRQ+FIQ mask.
379 	 */
380 	if (wfe_process_recommendation() == false) {
381 		/* If WFE recommendation absent, or WFE deadline
382 		 * arrived with no interrupt pending/processed,
383 		 * fall back to WFI.
384 		 */
385 		Idle_context();
386 	}
387 	__builtin_arm_wsr("DAIFClr", DAIFSC_STANDARD_DISABLE);
388 }
389 
390 void
OSSynchronizeIO(void)391 OSSynchronizeIO(void)
392 {
393 	__builtin_arm_dsb(DSB_SY);
394 }
395 
396 uint64_t
get_aux_control(void)397 get_aux_control(void)
398 {
399 	uint64_t        value;
400 
401 	MRS(value, "ACTLR_EL1");
402 	return value;
403 }
404 
405 uint64_t
get_mmu_control(void)406 get_mmu_control(void)
407 {
408 	uint64_t        value;
409 
410 	MRS(value, "SCTLR_EL1");
411 	return value;
412 }
413 
414 uint64_t
get_tcr(void)415 get_tcr(void)
416 {
417 	uint64_t        value;
418 
419 	MRS(value, "TCR_EL1");
420 	return value;
421 }
422 
423 __mockable boolean_t
ml_get_interrupts_enabled(void)424 ml_get_interrupts_enabled(void)
425 {
426 	uint64_t        value;
427 
428 	MRS(value, "DAIF");
429 	if ((value & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE) {
430 		return FALSE;
431 	}
432 	return TRUE;
433 }
434 
435 pmap_paddr_t
get_mmu_ttb(void)436 get_mmu_ttb(void)
437 {
438 	pmap_paddr_t    value;
439 
440 	MRS(value, "TTBR0_EL1");
441 	return value;
442 }
443 
444 MARK_AS_FIXUP_TEXT uint32_t
get_arm_cpu_version(void)445 get_arm_cpu_version(void)
446 {
447 	uint32_t value = machine_read_midr();
448 
449 	/* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
450 	return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
451 }
452 
453 bool
ml_feature_supported(uint64_t feature_bit)454 ml_feature_supported(uint64_t feature_bit)
455 {
456 	uint64_t aidr_el1_value = 0;
457 
458 	MRS(aidr_el1_value, "AIDR_EL1");
459 
460 #ifdef APPLEAVALANCHE
461 #endif // APPLEAVALANCHE
462 
463 	return aidr_el1_value & feature_bit;
464 }
465 
466 /*
467  * user_cont_hwclock_allowed()
468  *
469  * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
470  * as a continuous time source (e.g. from mach_continuous_time)
471  */
472 boolean_t
user_cont_hwclock_allowed(void)473 user_cont_hwclock_allowed(void)
474 {
475 #if HAS_CONTINUOUS_HWCLOCK
476 	return TRUE;
477 #else
478 	return FALSE;
479 #endif
480 }
481 
482 /*
483  * user_timebase_type()
484  *
485  * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
486  *
487  * USER_TIMEBASE_NONE: EL0 has no access to timebase register
488  * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
489  * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
490  *
491  */
492 
493 uint8_t
user_timebase_type(void)494 user_timebase_type(void)
495 {
496 #if HAS_ACNTVCT
497 	return USER_TIMEBASE_NOSPEC_APPLE;
498 #elif HAS_APPLE_GENERIC_TIMER
499 	// Conveniently, S3_4_C15_C10_6 and ACNTVCT_EL0 have identical encodings
500 	return USER_TIMEBASE_NOSPEC_APPLE;
501 #elif __ARM_ARCH_8_6__
502 	return USER_TIMEBASE_NOSPEC;
503 #else
504 	return USER_TIMEBASE_SPEC;
505 #endif
506 }
507 
508 void
machine_startup(__unused boot_args * args)509 machine_startup(__unused boot_args * args)
510 {
511 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
512 	if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
513 		gFastIPI = 1;
514 	}
515 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
516 
517 
518 	machine_conf();
519 
520 
521 	/*
522 	 * Kick off the kernel bootstrap.
523 	 */
524 	kernel_bootstrap();
525 	/* NOTREACHED */
526 }
527 
528 typedef void (*invalidate_fn_t)(void);
529 
530 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
531 
532 void set_invalidate_hmac_function(invalidate_fn_t fn);
533 
534 void
set_invalidate_hmac_function(invalidate_fn_t fn)535 set_invalidate_hmac_function(invalidate_fn_t fn)
536 {
537 	if (NULL != invalidate_hmac_function) {
538 		panic("Invalidate HMAC function already set");
539 	}
540 
541 	invalidate_hmac_function = fn;
542 }
543 
/* Report whether secure hibernation is supported; this implementation always answers false. */
bool
ml_is_secure_hib_supported(void)
{
	return false;
}
549 
550 static void ml_release_deferred_pages(void);
551 
552 void
machine_lockdown(void)553 machine_lockdown(void)
554 {
555 
556 #if CONFIG_SPTM
557 
558 	/**
559 	 * On devices that make use of the SPTM, the SPTM is responsible for
560 	 * managing system register locks. Due to this, we skip the call to
561 	 * spr_lockdown() below.
562 	 */
563 #else
564 #endif
565 
566 	arm_vm_prot_finalize(PE_state.bootArgs);
567 	ml_release_deferred_pages();
568 
569 #if CONFIG_KERNEL_INTEGRITY
570 #if KERNEL_INTEGRITY_WT
571 	/* Watchtower
572 	 *
573 	 * Notify the monitor about the completion of early kernel bootstrap.
574 	 * From this point forward it will enforce the integrity of kernel text,
575 	 * rodata and page tables.
576 	 */
577 
578 #ifdef MONITOR
579 	monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
580 #endif
581 #endif /* KERNEL_INTEGRITY_WT */
582 
583 #if CONFIG_SPTM
584 	extern void pmap_prepare_commpages(void);
585 	pmap_prepare_commpages();
586 
587 	/**
588 	 * sptm_lockdown_xnu() disables preemption like all SPTM calls, but may take
589 	 * a fair amount of time as it involves retyping a large number of pages.
590 	 * This preemption latency is not really a concern since we're still fairly
591 	 * early in the boot process, so just explicitly disable preemption before
592 	 * invoking the SPTM and abandon preemption latency measurements before
593 	 * re-enabling it.
594 	 */
595 	disable_preemption();
596 	/* Signal the SPTM that XNU is ready for RO memory to actually become read-only */
597 	sptm_lockdown_xnu();
598 #if SCHED_HYGIENE_DEBUG
599 	abandon_preemption_disable_measurement();
600 #endif /* SCHED_HYGIENE_DEBUG */
601 	enable_preemption();
602 #else
603 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
604 	/* KTRR
605 	 *
606 	 * Lock physical KTRR region. KTRR region is read-only. Memory outside
607 	 * the region is not executable at EL1.
608 	 */
609 
610 	rorgn_lockdown();
611 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
612 #endif /* CONFIG_SPTM */
613 
614 #if XNU_MONITOR
615 	pmap_lockdown_ppl();
616 #endif
617 
618 #endif /* CONFIG_KERNEL_INTEGRITY */
619 
620 
621 	/**
622 	 * For platforms that use SEP-backed hibernation, invoke kext-provided
623 	 * functionality to invalidate HMAC key in SIO used to sign a variety of
624 	 * data (e.g., the RO region).
625 	 *
626 	 * Just for paranoia's sake, let's make it so that if an attacker is
627 	 * capable of corrupting EDT early that they have to do so in a way that
628 	 * prevents invaldidate_hmac_function from running properly yet still
629 	 * makes it so that the invalidate HMAC function receives an OK
630 	 * response, which seems hard.
631 	 *
632 	 * This only makes sense for PPL-based systems seeing as SPTM-based systems
633 	 * will have iBoot invalidate Key1 for us.
634 	 */
635 	if (NULL != invalidate_hmac_function) {
636 #if !defined(CONFIG_SPTM)
637 		invalidate_hmac_function();
638 #endif /* !defined(CONFIG_SPTM) */
639 	}
640 
641 	lockdown_done = 1;
642 }
643 
644 
645 char           *
machine_boot_info(__unused char * buf,__unused vm_size_t size)646 machine_boot_info(
647 	__unused char *buf,
648 	__unused vm_size_t size)
649 {
650 	return PE_boot_args();
651 }
652 
653 void
machine_cpu_reinit(__unused void * param)654 machine_cpu_reinit(__unused void *param)
655 {
656 	cpu_machine_init();     /* Initialize the processor */
657 	clock_init();           /* Init the clock */
658 }
659 
660 /*
661  *	Routine:        machine_processor_shutdown
662  *	Function:
663  */
664 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)665 machine_processor_shutdown(
666 	__unused thread_t thread,
667 	void (*doshutdown)(processor_t),
668 	processor_t processor)
669 {
670 	return Shutdown_context(doshutdown, processor);
671 }
672 
673 /*
674  *      Routine:        ml_init_lock_timeout
675  *      Function:
676  */
677 static void __startup_func
ml_init_lock_timeout(void)678 ml_init_lock_timeout(void)
679 {
680 	/*
681 	 * This function is called after STARTUP_SUB_TIMEOUTS
682 	 * initialization, so using the "legacy" boot-args here overrides
683 	 * the ml-timeout-...  configuration. (Given that these boot-args
684 	 * here are usually explicitly specified, this makes sense by
685 	 * overriding ml-timeout-..., which may come from the device tree.
686 	 */
687 
688 	uint64_t lto_timeout_ns;
689 	uint64_t lto_abstime;
690 	uint32_t slto;
691 
692 	if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
693 		lto_timeout_ns = slto * NSEC_PER_USEC;
694 		nanoseconds_to_absolutetime(lto_timeout_ns, &lto_abstime);
695 		os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
696 	} else {
697 		lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
698 		absolutetime_to_nanoseconds(lto_abstime, &lto_timeout_ns);
699 	}
700 
701 	os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
702 
703 	if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
704 		nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, &lto_abstime);
705 		os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
706 	} else if (lto_abstime != 0) {
707 		os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
708 	} // else take default from MACHINE_TIMEOUT.
709 
710 	uint64_t mtxspin;
711 	uint64_t mtx_abstime;
712 	if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
713 		if (mtxspin > USEC_PER_SEC >> 4) {
714 			mtxspin =  USEC_PER_SEC >> 4;
715 		}
716 		nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
717 		os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
718 	} else {
719 		mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
720 	}
721 
722 	low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
723 	/*
724 	 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
725 	 * real_ncpus is not set at this time
726 	 *
727 	 * NOTE: active spinning is disabled in arm. It can be activated
728 	 * by setting high_MutexSpin through the sysctl.
729 	 */
730 	high_MutexSpin = low_MutexSpin;
731 
732 	uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
733 	PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
734 	nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
735 }
736 STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
737 
738 
/*
 * Called once all of the ml_processor_info_t structures have been
 * initialized and all the processors have been started through
 * processor_boot().  Forwards the notification to the scheduler.
 *
 * Required by the scheduler subsystem.
 */
void
ml_cpu_init_completed(void)
{
	sched_cpu_init_completed();
}
750 
751 /*
752  * This tracks which cpus are between ml_cpu_down and ml_cpu_up
753  */
754 _Atomic uint64_t ml_cpu_up_processors = 0;
755 
756 void
ml_cpu_up(void)757 ml_cpu_up(void)
758 {
759 	cpu_data_t *cpu_data_ptr = getCpuDatap();
760 
761 	assert(!bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));
762 
763 	atomic_bit_set(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_relaxed);
764 }
765 
766 /*
767  * These are called from the machine-independent routine cpu_up()
768  * to perform machine-dependent info updates.
769  *
770  * The update to CPU counts needs to be separate from other actions
771  * because we don't update the counts when CLPC causes temporary
772  * cluster powerdown events, as these must be transparent to the user.
773  */
774 
775 void
ml_cpu_up_update_counts(int cpu_id)776 ml_cpu_up_update_counts(int cpu_id)
777 {
778 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
779 
780 	os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
781 
782 	os_atomic_inc(&machine_info.physical_cpu, relaxed);
783 	os_atomic_inc(&machine_info.logical_cpu, relaxed);
784 }
785 
786 int
ml_find_next_up_processor()787 ml_find_next_up_processor()
788 {
789 	if (BootCpuData.cpu_running) {
790 		return BootCpuData.cpu_number;
791 	}
792 
793 	int next_active_cpu = lsb_first(os_atomic_load(&ml_cpu_up_processors, relaxed));
794 
795 	if (next_active_cpu == -1) {
796 		assertf(ml_is_quiescing(), "can only have no active CPUs in quiesce state");
797 		next_active_cpu = BootCpuData.cpu_number;
798 	}
799 
800 	return next_active_cpu;
801 }
802 
803 /*
804  * These are called from the machine-independent routine cpu_down()
805  * to perform machine-dependent info updates.
806  *
807  * The update to CPU counts needs to be separate from other actions
808  * because we don't update the counts when CLPC causes temporary
809  * cluster powerdown events, as these must be transparent to the user.
810  */
811 void
ml_cpu_down(void)812 ml_cpu_down(void)
813 {
814 	/*
815 	 * If we want to deal with outstanding IPIs, we need to
816 	 * do relatively early in the processor_doshutdown path,
817 	 * as we pend decrementer interrupts using the IPI
818 	 * mechanism if we cannot immediately service them (if
819 	 * IRQ is masked).  Do so now.
820 	 *
821 	 * We aren't on the interrupt stack here; would it make
822 	 * more sense to disable signaling and then enable
823 	 * interrupts?  It might be a bit cleaner.
824 	 */
825 	cpu_data_t *cpu_data_ptr = getCpuDatap();
826 	cpu_data_ptr->cpu_running = FALSE;
827 
828 	assert((cpu_data_ptr->cpu_signal & SIGPdisabled) == 0);
829 	assert(bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));
830 
831 	atomic_bit_clear(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_release);
832 
833 	if (cpu_data_ptr == &BootCpuData && ml_is_quiescing()) {
834 		/*
835 		 * This is the boot CPU powering down for S2R, don't try to migrate its timers,
836 		 * because there is nobody else active to migrate it to.
837 		 */
838 		assert3u(os_atomic_load(&ml_cpu_up_processors, relaxed), ==, 0);
839 	} else if (cpu_data_ptr != &BootCpuData || (support_bootcpu_shutdown && !ml_is_quiescing())) {
840 		int next_cpu = ml_find_next_up_processor();
841 
842 		cpu_data_t* new_cpu_datap = cpu_datap(next_cpu);
843 
844 		/*
845 		 * Move all of this cpu's timers to another cpu that has not gone through ml_cpu_down,
846 		 * and poke it in case there's a sooner deadline for it to schedule.
847 		 *
848 		 * This depends on ml_cpu_down never running concurrently, which is guaranteed by
849 		 * the processor_updown_lock.
850 		 */
851 		timer_queue_shutdown(next_cpu, &cpu_data_ptr->rtclock_timer.queue,
852 		    &new_cpu_datap->rtclock_timer.queue);
853 
854 		/*
855 		 * Trigger timer_queue_expire_local to execute on the remote CPU.
856 		 *
857 		 * Because we have interrupts disabled here, we cannot use a
858 		 * standard cpu_xcall, which would deadlock against the stackshot
859 		 * IPI. This must be a fire-and-forget IPI.
860 		 */
861 		kern_return_t rv = cpu_signal(new_cpu_datap, SIGPTimerLocal, NULL, NULL);
862 
863 		if (rv != KERN_SUCCESS) {
864 			panic("ml_cpu_down: cpu_signal of cpu %d failure %d", next_cpu, rv);
865 		}
866 	} else {
867 		panic("boot cpu powering down with nowhere for its timers to go");
868 	}
869 
870 	cpu_signal_handler_internal(TRUE);
871 
872 	/* There should be no more pending IPIs on this core. */
873 	assert3u(getCpuDatap()->cpu_signal, ==, SIGPdisabled);
874 }
875 
876 void
ml_cpu_down_update_counts(int cpu_id)877 ml_cpu_down_update_counts(int cpu_id)
878 {
879 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
880 
881 	os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
882 
883 	os_atomic_dec(&machine_info.physical_cpu, relaxed);
884 	os_atomic_dec(&machine_info.logical_cpu, relaxed);
885 }
886 
887 
888 unsigned int
ml_get_machine_mem(void)889 ml_get_machine_mem(void)
890 {
891 	return machine_info.memory_size;
892 }
893 
894 __attribute__((noreturn))
895 void
halt_all_cpus(boolean_t reboot)896 halt_all_cpus(boolean_t reboot)
897 {
898 	if (reboot) {
899 		printf("MACH Reboot\n");
900 		PEHaltRestart(kPERestartCPU);
901 	} else {
902 		printf("CPU halted\n");
903 		PEHaltRestart(kPEHaltCPU);
904 	}
905 	while (1) {
906 		;
907 	}
908 }
909 
910 __attribute__((noreturn))
911 void
halt_cpu(void)912 halt_cpu(void)
913 {
914 	halt_all_cpus(FALSE);
915 }
916 
917 /*
918  *	Routine:        machine_signal_idle
919  *	Function:
920  */
921 void
machine_signal_idle(processor_t processor)922 machine_signal_idle(
923 	processor_t processor)
924 {
925 	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
926 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
927 }
928 
929 void
machine_signal_idle_deferred(processor_t processor)930 machine_signal_idle_deferred(
931 	processor_t processor)
932 {
933 	cpu_signal_deferred(processor_to_cpu_datap(processor), SIGPdeferred);
934 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
935 }
936 
937 void
machine_signal_idle_cancel(processor_t processor)938 machine_signal_idle_cancel(
939 	processor_t processor)
940 {
941 	cpu_signal_cancel(processor_to_cpu_datap(processor), SIGPdeferred);
942 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
943 }
944 
945 /*
946  *	Routine:        ml_install_interrupt_handler
947  *	Function:	Initialize Interrupt Handler
948  */
949 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)950 ml_install_interrupt_handler(
951 	void *nub,
952 	int source,
953 	void *target,
954 	IOInterruptHandler handler,
955 	void *refCon)
956 {
957 	cpu_data_t     *cpu_data_ptr;
958 	boolean_t       current_state;
959 
960 	current_state = ml_set_interrupts_enabled(FALSE);
961 	cpu_data_ptr = getCpuDatap();
962 
963 	cpu_data_ptr->interrupt_nub = nub;
964 	cpu_data_ptr->interrupt_source = source;
965 	cpu_data_ptr->interrupt_target = target;
966 	cpu_data_ptr->interrupt_handler = handler;
967 	cpu_data_ptr->interrupt_refCon = refCon;
968 
969 	(void) ml_set_interrupts_enabled(current_state);
970 }
971 
/*
 *	Routine:        ml_init_interrupt
 *	Function:	Initialize Interrupts. Only does work when HAS_IPI is
 *			defined.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * This routine runs once per CPU, but the deferred-IPI timer register
	 * has a single global copy (skye), so programming it from every CPU
	 * would be redundant. Only CPUs flagged cluster_master do it.
	 */
	cpu_data_t *cdp = getCpuDatap();

	if (cdp->cluster_master) {
		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
	}
#endif
}
990 
991 /*
992  *	Routine:        ml_init_timebase
993  *	Function:	register and setup Timebase, Decremeter services
994  */
995 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)996 ml_init_timebase(
997 	void            *args,
998 	tbd_ops_t       tbd_funcs,
999 	vm_offset_t     int_address,
1000 	vm_offset_t     int_value __unused)
1001 {
1002 	cpu_data_t     *cpu_data_ptr;
1003 
1004 	cpu_data_ptr = (cpu_data_t *)args;
1005 
1006 	if ((cpu_data_ptr == &BootCpuData)
1007 	    && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
1008 		rtclock_timebase_func = *tbd_funcs;
1009 		rtclock_timebase_addr = int_address;
1010 	}
1011 }
1012 
1013 #define ML_READPROP_MANDATORY UINT64_MAX
1014 
1015 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)1016 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
1017 {
1018 	void const *prop;
1019 	unsigned int propSize;
1020 
1021 	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
1022 		if (propSize == sizeof(uint8_t)) {
1023 			return *((uint8_t const *)prop);
1024 		} else if (propSize == sizeof(uint16_t)) {
1025 			return *((uint16_t const *)prop);
1026 		} else if (propSize == sizeof(uint32_t)) {
1027 			return *((uint32_t const *)prop);
1028 		} else if (propSize == sizeof(uint64_t)) {
1029 			return *((uint64_t const *)prop);
1030 		} else {
1031 			panic("CPU property '%s' has bad size %u", propertyName, propSize);
1032 		}
1033 	} else {
1034 		if (default_value == ML_READPROP_MANDATORY) {
1035 			panic("Missing mandatory property '%s'", propertyName);
1036 		}
1037 		return default_value;
1038 	}
1039 }
1040 
1041 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)1042 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
1043 {
1044 	uint64_t const *prop;
1045 	unsigned int propSize;
1046 
1047 	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
1048 		return FALSE;
1049 	}
1050 
1051 	if (propSize != sizeof(uint64_t) * 2) {
1052 		panic("Wrong property size for %s", propertyName);
1053 	}
1054 
1055 	*pa_ptr = prop[0];
1056 	*len_ptr = prop[1];
1057 	return TRUE;
1058 }
1059 
1060 static boolean_t
ml_is_boot_cpu(const DTEntry entry)1061 ml_is_boot_cpu(const DTEntry entry)
1062 {
1063 	void const *prop;
1064 	unsigned int propSize;
1065 
1066 	if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
1067 		panic("unable to retrieve state for cpu");
1068 	}
1069 
1070 	if (strncmp((char const *)prop, "running", propSize) == 0) {
1071 		return TRUE;
1072 	} else {
1073 		return FALSE;
1074 	}
1075 }
1076 
1077 static void
ml_cluster_power_override(unsigned int * flag)1078 ml_cluster_power_override(unsigned int *flag)
1079 {
1080 #if XNU_CLUSTER_POWER_DOWN
1081 	/*
1082 	 * Old method (H14/H15): enable CPD in the kernel build
1083 	 * For H16+, *flag may have be set to 1 through EDT
1084 	 */
1085 	*flag = 1;
1086 #endif
1087 
1088 	/*
1089 	 * If a boot-arg is set that allows threads to be bound
1090 	 * to a cpu or cluster, cluster_power_down must
1091 	 * default to false.
1092 	 */
1093 #ifdef CONFIG_XNUPOST
1094 	uint64_t kernel_post = 0;
1095 	PE_parse_boot_argn("kernPOST", &kernel_post, sizeof(kernel_post));
1096 	if (kernel_post != 0) {
1097 		*flag = 0;
1098 	}
1099 #endif
1100 	if (PE_parse_boot_argn("enable_skstb", NULL, 0)) {
1101 		*flag = 0;
1102 	}
1103 	if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
1104 		*flag = 0;
1105 	}
1106 
1107 	/* Always let the user manually override, even if it's unsupported */
1108 	PE_parse_boot_argn("cluster_power", flag, sizeof(*flag));
1109 }
1110 
1111 
1112 static void
ml_read_chip_revision(unsigned int * rev __unused)1113 ml_read_chip_revision(unsigned int *rev __unused)
1114 {
1115 	// The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
1116 #ifdef APPLE_ARM64_ARCH_FAMILY
1117 	DTEntry         entryP;
1118 
1119 	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
1120 		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
1121 	} else {
1122 		*rev = CPU_VERSION_UNKNOWN;
1123 	}
1124 #endif
1125 }
1126 
1127 void
ml_parse_cpu_topology(void)1128 ml_parse_cpu_topology(void)
1129 {
1130 	DTEntry entry, child __unused;
1131 	OpaqueDTEntryIterator iter;
1132 	uint32_t cpu_boot_arg = MAX_CPUS;
1133 	uint64_t cpumask_boot_arg = ULLONG_MAX;
1134 	int err;
1135 
1136 	int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
1137 	int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
1138 	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
1139 	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
1140 
1141 	// The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
1142 	// so that we trigger a panic later in the boot process, once serial is enabled.
1143 	if (cpus_boot_arg_present && cpumask_boot_arg_present) {
1144 		cpu_config_correct = false;
1145 	}
1146 
1147 	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
1148 	assert(err == kSuccess);
1149 
1150 	err = SecureDTInitEntryIterator(entry, &iter);
1151 	assert(err == kSuccess);
1152 
1153 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1154 		cluster_offsets[i] = -1;
1155 		cluster_phys_to_logical[i] = -1;
1156 		cluster_max_cpu_phys_id[i] = 0;
1157 	}
1158 
1159 	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
1160 		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
1161 		boolean_t cpu_enabled = cpumask_boot_arg & 1;
1162 		cpumask_boot_arg >>= 1;
1163 
1164 		// Boot CPU disabled in cpumask. Flag this so that we trigger a panic
1165 		// later in the boot process, once serial is enabled.
1166 		if (is_boot_cpu && !cpu_enabled) {
1167 			cpu_config_correct = false;
1168 		}
1169 
1170 		// Ignore this CPU if it has been disabled by the cpumask= boot-arg.
1171 		if (!is_boot_cpu && !cpu_enabled) {
1172 			continue;
1173 		}
1174 
1175 		// If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
1176 		// been added to the topology struct yet, and we only have one slot left, then skip
1177 		// every other non-boot CPU in order to leave room for the boot CPU.
1178 		//
1179 		// e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
1180 		// array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
1181 		if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
1182 			continue;
1183 		}
1184 		if (topology_info.num_cpus >= cpu_boot_arg) {
1185 			break;
1186 		}
1187 
1188 		ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
1189 
1190 		cpu->cpu_id = topology_info.num_cpus++;
1191 		assert(cpu->cpu_id < MAX_CPUS);
1192 		topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
1193 
1194 		cpu->die_id = (int)ml_readprop(child, "die-id", 0);
1195 		topology_info.max_die_id = MAX(topology_info.max_die_id, cpu->die_id);
1196 
1197 		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
1198 
1199 		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
1200 		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
1201 		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
1202 		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
1203 
1204 		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
1205 		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
1206 		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1207 		cpu->cluster_type = CLUSTER_TYPE_SMP;
1208 
1209 		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1210 		if (cluster_type == 'E') {
1211 			cpu->cluster_type = CLUSTER_TYPE_E;
1212 		} else if (cluster_type == 'P') {
1213 			cpu->cluster_type = CLUSTER_TYPE_P;
1214 		}
1215 
1216 		if (ml_readprop(child, "cluster-power-down", 0)) {
1217 			topology_info.cluster_power_down = 1;
1218 		}
1219 
1220 		topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1221 
1222 		/*
1223 		 * Since we want to keep a linear cluster ID space, we cannot just rely
1224 		 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1225 		 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1226 		 */
1227 #if HAS_CLUSTER
1228 		uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1229 #else
1230 		uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1231 #endif
1232 		assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1233 		cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1234 		    topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1235 
1236 		assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1237 
1238 		ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1239 		if (cluster->num_cpus == 0) {
1240 			assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1241 
1242 			topology_info.num_clusters++;
1243 			topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1244 			topology_info.cluster_types |= (1 << cpu->cluster_type);
1245 
1246 			cluster->cluster_id = cpu->cluster_id;
1247 			cluster->die_id = cpu->die_id;
1248 			cluster->cluster_type = cpu->cluster_type;
1249 			cluster->first_cpu_id = cpu->cpu_id;
1250 			assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1251 			cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1252 
1253 			topology_info.cluster_type_num_clusters[cluster->cluster_type]++;
1254 
1255 			// Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1256 			// If we wind up with a bunch of these, we might want to create separate per-cluster
1257 			// EDT nodes and have the CPU nodes reference them through a phandle.
1258 			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1259 			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1260 		}
1261 
1262 #if HAS_CLUSTER
1263 		if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1264 			cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1265 		}
1266 #endif
1267 
1268 		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1269 		cluster->die_cluster_id = cpu->die_cluster_id;
1270 
1271 		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1272 
1273 		cluster->num_cpus++;
1274 		cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1275 
1276 		if (is_boot_cpu) {
1277 			assert(topology_info.boot_cpu == NULL);
1278 			topology_info.boot_cpu = cpu;
1279 			topology_info.boot_cluster = cluster;
1280 		}
1281 
1282 #if CONFIG_SPTM
1283 		sptm_register_cpu(cpu->phys_id);
1284 #endif
1285 	}
1286 
1287 #if HAS_CLUSTER
1288 	/*
1289 	 * Build the cluster offset array, ensuring that the region reserved
1290 	 * for each physical cluster contains enough entries to be indexed
1291 	 * by the maximum physical CPU ID (AFF0) within the cluster.
1292 	 */
1293 	unsigned int cur_cluster_offset = 0;
1294 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1295 		if (cluster_phys_to_logical[i] != -1) {
1296 			cluster_offsets[i] = cur_cluster_offset;
1297 			cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1298 		}
1299 	}
1300 	assert(cur_cluster_offset <= MAX_CPUS);
1301 #else
1302 	/*
1303 	 * For H10, there are really 2 physical clusters, but they are not separated
1304 	 * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
1305 	 * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
1306 	 * treat H10 and earlier devices as though they contain a single cluster.
1307 	 */
1308 	cluster_offsets[0] = 0;
1309 #endif
1310 	assert(topology_info.boot_cpu != NULL);
1311 	ml_read_chip_revision(&topology_info.chip_revision);
1312 	ml_cluster_power_override(&topology_info.cluster_power_down);
1313 
1314 	/*
1315 	 * Set TPIDR_EL0 to indicate the correct cpu number & cluster id,
1316 	 * as we may not be booting from cpu 0. Userspace will consume
1317 	 * the current CPU number through this register. For non-boot
1318 	 * cores, this is done in start.s (start_cpu) using the per-cpu
1319 	 * data object.
1320 	 */
1321 	ml_topology_cpu_t *boot_cpu = topology_info.boot_cpu;
1322 	uint64_t tpidr_el0 = ((boot_cpu->cpu_id << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
1323 	    ((boot_cpu->cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
1324 	assert(((tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == boot_cpu->cpu_id);
1325 	assert(((tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == boot_cpu->cluster_id);
1326 	__builtin_arm_wsr64("TPIDR_EL0", tpidr_el0);
1327 
1328 	__builtin_arm_wsr64("TPIDRRO_EL0", 0);
1329 }
1330 
1331 const ml_topology_info_t *
ml_get_topology_info(void)1332 ml_get_topology_info(void)
1333 {
1334 	return &topology_info;
1335 }
1336 
1337 void
ml_map_cpu_pio(void)1338 ml_map_cpu_pio(void)
1339 {
1340 	unsigned int i;
1341 
1342 	for (i = 0; i < topology_info.num_cpus; i++) {
1343 		ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1344 		if (cpu->cpu_IMPL_pa) {
1345 			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1346 			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1347 		}
1348 		if (cpu->cpu_UTTDBG_pa) {
1349 			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1350 		}
1351 	}
1352 
1353 	for (i = 0; i < topology_info.num_clusters; i++) {
1354 		ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1355 		if (cluster->acc_IMPL_pa) {
1356 			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1357 		}
1358 		if (cluster->cpm_IMPL_pa) {
1359 			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1360 		}
1361 	}
1362 }
1363 
1364 __mockable unsigned int
ml_get_cpu_count(void)1365 ml_get_cpu_count(void)
1366 {
1367 	return topology_info.num_cpus;
1368 }
1369 
1370 unsigned int
ml_get_cluster_count(void)1371 ml_get_cluster_count(void)
1372 {
1373 	return topology_info.num_clusters;
1374 }
1375 
1376 int
ml_get_boot_cpu_number(void)1377 ml_get_boot_cpu_number(void)
1378 {
1379 	return topology_info.boot_cpu->cpu_id;
1380 }
1381 
1382 cluster_type_t
ml_get_boot_cluster_type(void)1383 ml_get_boot_cluster_type(void)
1384 {
1385 	return topology_info.boot_cluster->cluster_type;
1386 }
1387 
1388 int
ml_get_cpu_number(uint32_t phys_id)1389 ml_get_cpu_number(uint32_t phys_id)
1390 {
1391 	phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1392 
1393 	for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1394 		if (topology_info.cpus[i].phys_id == phys_id) {
1395 			return i;
1396 		}
1397 	}
1398 
1399 	return -1;
1400 }
1401 
1402 int
ml_get_cluster_number(uint32_t phys_id)1403 ml_get_cluster_number(uint32_t phys_id)
1404 {
1405 	int cpu_id = ml_get_cpu_number(phys_id);
1406 	if (cpu_id < 0) {
1407 		return -1;
1408 	}
1409 
1410 	ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1411 
1412 	return cpu->cluster_id;
1413 }
1414 
1415 unsigned int
ml_get_cpu_number_local(void)1416 ml_get_cpu_number_local(void)
1417 {
1418 	uint64_t mpidr_el1_value = 0;
1419 	unsigned cpu_id;
1420 
1421 	/* We identify the CPU based on the constant bits of MPIDR_EL1. */
1422 	MRS(mpidr_el1_value, "MPIDR_EL1");
1423 	cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
1424 
1425 	assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
1426 
1427 	return cpu_id;
1428 }
1429 
1430 int
ml_get_cluster_number_local()1431 ml_get_cluster_number_local()
1432 {
1433 	uint64_t mpidr_el1_value = 0;
1434 	unsigned cluster_id;
1435 
1436 	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
1437 	MRS(mpidr_el1_value, "MPIDR_EL1");
1438 	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
1439 
1440 	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
1441 
1442 	return cluster_id;
1443 }
1444 
1445 int
ml_get_max_cpu_number(void)1446 ml_get_max_cpu_number(void)
1447 {
1448 	return topology_info.max_cpu_id;
1449 }
1450 
1451 int
ml_get_max_cluster_number(void)1452 ml_get_max_cluster_number(void)
1453 {
1454 	return topology_info.max_cluster_id;
1455 }
1456 
1457 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1458 ml_get_first_cpu_id(unsigned int cluster_id)
1459 {
1460 	return topology_info.clusters[cluster_id].first_cpu_id;
1461 }
1462 
1463 static_assert(MAX_CPUS <= 256, "MAX_CPUS must fit in _COMM_PAGE_CPU_TO_CLUSTER; Increase table size if needed");
1464 
1465 void
ml_map_cpus_to_clusters(uint8_t * table)1466 ml_map_cpus_to_clusters(uint8_t *table)
1467 {
1468 	for (uint16_t cpu_id = 0; cpu_id < topology_info.num_cpus; cpu_id++) {
1469 		*(table + cpu_id) = (uint8_t)(topology_info.cpus[cpu_id].cluster_id);
1470 	}
1471 }
1472 
1473 /*
1474  * Return the die id of a cluster.
1475  */
1476 unsigned int
ml_get_die_id(unsigned int cluster_id)1477 ml_get_die_id(unsigned int cluster_id)
1478 {
1479 	/*
1480 	 * The current implementation gets the die_id from the
1481 	 * first CPU of the cluster.
1482 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1483 	 */
1484 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1485 	return topology_info.cpus[first_cpu].die_id;
1486 }
1487 
1488 /*
1489  * Return the index of a cluster in its die.
1490  */
1491 unsigned int
ml_get_die_cluster_id(unsigned int cluster_id)1492 ml_get_die_cluster_id(unsigned int cluster_id)
1493 {
1494 	/*
1495 	 * The current implementation gets the die_id from the
1496 	 * first CPU of the cluster.
1497 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1498 	 */
1499 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1500 	return topology_info.cpus[first_cpu].die_cluster_id;
1501 }
1502 
1503 /*
1504  * Return the highest die id of the system.
1505  */
1506 unsigned int
ml_get_max_die_id(void)1507 ml_get_max_die_id(void)
1508 {
1509 	return topology_info.max_die_id;
1510 }
1511 
/*
 * Lockdown initialisation hook.
 *
 * When any of the KTRR/CTRR/PV_CTRR kernel-integrity options is defined,
 * this calls rorgn_stash_range(); otherwise it does nothing.
 *
 * Defined with an explicit (void) parameter list, like the rest of this
 * file, so the definition is a real prototype and not an old-style
 * declarator that accepts any arguments.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	rorgn_stash_range();
#endif
}
1519 
1520 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1521 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1522 {
1523 	if (!f) {
1524 		return KERN_FAILURE;
1525 	}
1526 
1527 	assert(lockdown_done);
1528 	f(this); // XXX: f this whole function
1529 
1530 	return KERN_SUCCESS;
1531 }
1532 
1533 static mcache_flush_function mcache_flush_func;
1534 static void* mcache_flush_service;
1535 kern_return_t
ml_mcache_flush_callback_register(mcache_flush_function func,void * service)1536 ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
1537 {
1538 	mcache_flush_service = service;
1539 	mcache_flush_func = func;
1540 
1541 	return KERN_SUCCESS;
1542 }
1543 
1544 kern_return_t
ml_mcache_flush(void)1545 ml_mcache_flush(void)
1546 {
1547 	if (!mcache_flush_func) {
1548 		panic("Cannot flush M$ with no flush callback registered");
1549 
1550 		return KERN_FAILURE;
1551 	} else {
1552 		return mcache_flush_func(mcache_flush_service);
1553 	}
1554 }
1555 
1556 
1557 kern_return_t ml_mem_fault_report_enable_register(void);
1558 kern_return_t
ml_mem_fault_report_enable_register(void)1559 ml_mem_fault_report_enable_register(void)
1560 {
1561 	return KERN_SUCCESS;
1562 }
1563 
1564 kern_return_t ml_amcc_error_inject_register(void);
1565 kern_return_t
ml_amcc_error_inject_register(void)1566 ml_amcc_error_inject_register(void)
1567 {
1568 	return KERN_SUCCESS;
1569 }
1570 
1571 kern_return_t ml_dcs_error_inject_register(void);
1572 kern_return_t
ml_dcs_error_inject_register(void)1573 ml_dcs_error_inject_register(void)
1574 {
1575 	return KERN_SUCCESS;
1576 }
1577 
1578 
1579 extern lck_mtx_t pset_create_lock;
1580 
1581 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1582 ml_processor_register(ml_processor_info_t *in_processor_info,
1583     processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1584     perfmon_interrupt_handler_func *pmi_handler_out)
1585 {
1586 	cpu_data_t *this_cpu_datap;
1587 	processor_set_t pset;
1588 	boolean_t  is_boot_cpu;
1589 	static unsigned int reg_cpu_count = 0;
1590 
1591 	if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1592 		return KERN_FAILURE;
1593 	}
1594 
1595 	if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
1596 		return KERN_FAILURE;
1597 	}
1598 
1599 	if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1600 		is_boot_cpu = FALSE;
1601 		this_cpu_datap = cpu_data_alloc(FALSE);
1602 		cpu_data_init(this_cpu_datap);
1603 	} else {
1604 		this_cpu_datap = &BootCpuData;
1605 		is_boot_cpu = TRUE;
1606 		/*
1607 		 * Note that ml_processor_register happens for the boot cpu
1608 		 * *after* it starts running arbitrary threads, possibly
1609 		 * including *userspace*, depending on how long the CPU
1610 		 * services take to match.
1611 		 */
1612 	}
1613 
1614 	assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1615 
1616 	this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1617 
1618 	if (!is_boot_cpu) {
1619 		this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1620 		cpu_data_register(this_cpu_datap);
1621 		assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1622 	}
1623 
1624 	this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1625 	this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1626 	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1627 	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1628 
1629 	this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1630 	this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1631 
1632 	this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1633 	this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1634 	this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1635 
1636 	this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1637 	this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1638 	this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1639 	this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1640 	this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1641 	this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1642 
1643 	/*
1644 	 * Encode cpu_id, cluster_id to be stored in TPIDR_EL0 (see
1645 	 * cswitch.s:set_thread_registers, start.s:start_cpu) for consumption
1646 	 * by userspace.
1647 	 */
1648 	this_cpu_datap->cpu_tpidr_el0 = ((this_cpu_datap->cpu_number << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
1649 	    ((this_cpu_datap->cpu_cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
1650 	assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == this_cpu_datap->cpu_number);
1651 	assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == this_cpu_datap->cpu_cluster_id);
1652 
1653 #if HAS_CLUSTER
1654 	this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1655 #else /* HAS_CLUSTER */
1656 	this_cpu_datap->cluster_master = is_boot_cpu;
1657 #endif /* HAS_CLUSTER */
1658 	lck_mtx_lock(&pset_create_lock);
1659 	pset = pset_find(in_processor_info->cluster_id, NULL);
1660 	kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1661 	if (pset == NULL) {
1662 		pset = pset_create(this_cpu_datap->cpu_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1663 		assert(pset != PROCESSOR_SET_NULL);
1664 #if __AMP__
1665 		kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1666 #endif /* __AMP__ */
1667 	}
1668 	kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1669 	lck_mtx_unlock(&pset_create_lock);
1670 
1671 	processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1672 	if (!is_boot_cpu) {
1673 		processor_init(processor, this_cpu_datap->cpu_number, pset);
1674 	}
1675 
1676 	*processor_out = processor;
1677 	*ipi_handler_out = cpu_signal_handler;
1678 #if CPMU_AIC_PMI && CONFIG_CPU_COUNTERS
1679 	*pmi_handler_out = mt_cpmu_aic_pmi;
1680 #else
1681 	*pmi_handler_out = NULL;
1682 #endif /* CPMU_AIC_PMI && CONFIG_CPU_COUNTERS */
1683 	if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1684 		*in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1685 	}
1686 
1687 #if CONFIG_CPU_COUNTERS
1688 	kpc_register_cpu(this_cpu_datap);
1689 #endif /* CONFIG_CPU_COUNTERS */
1690 
1691 #ifdef APPLEEVEREST
1692 	/**
1693 	 * H15 SoCs have PIO lockdown applied at early boot for secondary CPUs.
1694 	 * Save PIO lock base addreses.
1695 	 */
1696 	const uint32_t log_id = in_processor_info->log_id;
1697 	const unsigned int cluster_id = topology_info.cpus[log_id].cluster_id;
1698 	this_cpu_datap->cpu_reg_paddr = topology_info.cpus[log_id].cpu_IMPL_pa;
1699 	this_cpu_datap->acc_reg_paddr = topology_info.clusters[cluster_id].acc_IMPL_pa;
1700 	this_cpu_datap->cpm_reg_paddr = topology_info.clusters[cluster_id].cpm_IMPL_pa;
1701 #endif
1702 
1703 
1704 	if (!is_boot_cpu) {
1705 		random_cpu_init(this_cpu_datap->cpu_number);
1706 		// now let next CPU register itself
1707 		OSIncrementAtomic((SInt32*)&real_ncpus);
1708 	}
1709 
1710 	os_atomic_or(&this_cpu_datap->cpu_flags, InitState, relaxed);
1711 
1712 #if !USE_APPLEARMSMP
1713 	/*
1714 	 * AppleARMCPU's external processor_start call is now a no-op, so
1715 	 * boot the processor directly when it's registered.
1716 	 *
1717 	 * It needs to be booted here for the boot processor to finish the
1718 	 * subsequent registerInterrupt operations and unblock the other cores.
1719 	 */
1720 	processor_boot(processor);
1721 #endif /* !USE_APPLEARMSMP */
1722 
1723 	return KERN_SUCCESS;
1724 }
1725 
1726 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1727 ml_init_arm_debug_interface(
1728 	void * in_cpu_datap,
1729 	vm_offset_t virt_address)
1730 {
1731 	((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1732 	do_debugid();
1733 }
1734 
1735 /*
1736  *	Routine:        init_ast_check
1737  *	Function:
1738  */
1739 void
init_ast_check(__unused processor_t processor)1740 init_ast_check(
1741 	__unused processor_t processor)
1742 {
1743 }
1744 
1745 /*
1746  *	Routine:        cause_ast_check
1747  *	Function:
1748  */
1749 void
cause_ast_check(processor_t processor)1750 cause_ast_check(
1751 	processor_t processor)
1752 {
1753 	assert(processor != PROCESSOR_NULL);
1754 
1755 	if (current_processor() != processor) {
1756 		cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1757 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1758 	}
1759 }
1760 
1761 extern uint32_t cpu_idle_count;
1762 
1763 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1764 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1765 {
1766 	*icp = ml_at_interrupt_context();
1767 	*pidlep = (cpu_idle_count == real_ncpus);
1768 }
1769 
/*
 *	Routine:        ml_cause_interrupt
 *	Function:	Generate a fake interrupt. Currently a no-op.
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX */
}
1779 
1780 /* Map memory map IO space */
1781 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1782 ml_io_map(
1783 	vm_offset_t phys_addr,
1784 	vm_size_t size)
1785 {
1786 	return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1787 }
1788 
1789 /* Map memory map IO space (with protections specified) */
1790 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1791 ml_io_map_with_prot(
1792 	vm_offset_t phys_addr,
1793 	vm_size_t size,
1794 	vm_prot_t prot)
1795 {
1796 	return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1797 }
1798 
1799 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1800 ml_io_map_unmappable(
1801 	vm_offset_t             phys_addr,
1802 	vm_size_t               size,
1803 	unsigned int            flags)
1804 {
1805 	return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1806 }
1807 
1808 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1809 ml_io_map_wcomb(
1810 	vm_offset_t phys_addr,
1811 	vm_size_t size)
1812 {
1813 	return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1814 }
1815 
1816 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1817 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1818 {
1819 	pmap_remove(kernel_pmap, addr, addr + sz);
1820 	kmem_free(kernel_map, addr, sz);
1821 }
1822 
1823 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1824 ml_map_high_window(
1825 	vm_offset_t     phys_addr,
1826 	vm_size_t       len)
1827 {
1828 	return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1829 }
1830 
1831 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1832 ml_static_ptovirt(
1833 	vm_offset_t paddr)
1834 {
1835 	return phystokv(paddr);
1836 }
1837 
1838 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1839 ml_static_slide(
1840 	vm_offset_t vaddr)
1841 {
1842 	vm_offset_t slid_vaddr = 0;
1843 
1844 #if CONFIG_SPTM
1845 	if ((vaddr >= vm_sptm_offsets.unslid_base) && (vaddr < vm_sptm_offsets.unslid_top)) {
1846 		slid_vaddr = vaddr + vm_sptm_offsets.slide;
1847 	} else if ((vaddr >= vm_txm_offsets.unslid_base) && (vaddr < vm_txm_offsets.unslid_top)) {
1848 		slid_vaddr = vaddr + vm_txm_offsets.slide;
1849 	} else
1850 #endif /* CONFIG_SPTM */
1851 	{
1852 		slid_vaddr = vaddr + vm_kernel_slide;
1853 	}
1854 
1855 	if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
1856 		/* This is only intended for use on static kernel addresses. */
1857 		return 0;
1858 	}
1859 
1860 	return slid_vaddr;
1861 }
1862 
1863 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1864 ml_static_unslide(
1865 	vm_offset_t vaddr)
1866 {
1867 	if (!VM_KERNEL_IS_SLID(vaddr)) {
1868 		/* This is only intended for use on static kernel addresses. */
1869 		return 0;
1870 	}
1871 
1872 #if CONFIG_SPTM
1873 	/**
1874 	 * Addresses coming from the SPTM and TXM have a different slide than the
1875 	 * rest of the kernel.
1876 	 */
1877 	if ((vaddr >= vm_sptm_offsets.slid_base) && (vaddr < vm_sptm_offsets.slid_top)) {
1878 		return vaddr - vm_sptm_offsets.slide;
1879 	}
1880 
1881 	if ((vaddr >= vm_txm_offsets.slid_base) && (vaddr < vm_txm_offsets.slid_top)) {
1882 		return vaddr - vm_txm_offsets.slide;
1883 	}
1884 #endif /* CONFIG_SPTM */
1885 
1886 	return vaddr - vm_kernel_slide;
1887 }
1888 
1889 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1890 
1891 kern_return_t
ml_static_protect(vm_offset_t vaddr,vm_size_t size,vm_prot_t new_prot __unused)1892 ml_static_protect(
1893 	vm_offset_t vaddr, /* kernel virtual address */
1894 	vm_size_t size,
1895 	vm_prot_t new_prot __unused)
1896 {
1897 #if CONFIG_SPTM
1898 	/**
1899 	 * Retype any frames that may be passed to the VM to XNU_DEFAULT.
1900 	 */
1901 	for (vm_offset_t sptm_vaddr_cur = vaddr; sptm_vaddr_cur < trunc_page_64(vaddr + size); sptm_vaddr_cur += PAGE_SIZE) {
1902 		/* Check if this frame is XNU_DEFAULT and only retype it if is not */
1903 		sptm_paddr_t sptm_paddr_cur = kvtophys_nofail(sptm_vaddr_cur);
1904 		sptm_frame_type_t current_type = sptm_get_frame_type(sptm_paddr_cur);
1905 		if (current_type != XNU_DEFAULT) {
1906 			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1907 			sptm_retype(sptm_paddr_cur, current_type, XNU_DEFAULT, retype_params);
1908 		}
1909 	}
1910 
1911 	return KERN_SUCCESS;
1912 #else /* CONFIG_SPTM */
1913 	pt_entry_t    arm_prot = 0;
1914 	pt_entry_t    arm_block_prot = 0;
1915 	vm_offset_t   vaddr_cur;
1916 	ppnum_t       ppn;
1917 	kern_return_t result = KERN_SUCCESS;
1918 
1919 	if (vaddr < physmap_base) {
1920 		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) physmap_base);
1921 		return KERN_FAILURE;
1922 	}
1923 
1924 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1925 
1926 	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
1927 		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
1928 	}
1929 	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
1930 		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
1931 	}
1932 
1933 	/* Set up the protection bits, and block bits so we can validate block mappings. */
1934 	if (new_prot & VM_PROT_WRITE) {
1935 		arm_prot |= ARM_PTE_AP(AP_RWNA);
1936 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
1937 	} else {
1938 		arm_prot |= ARM_PTE_AP(AP_RONA);
1939 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
1940 	}
1941 
1942 	arm_prot |= ARM_PTE_NX;
1943 	arm_block_prot |= ARM_TTE_BLOCK_NX;
1944 
1945 	if (!(new_prot & VM_PROT_EXECUTE)) {
1946 		arm_prot |= ARM_PTE_PNX;
1947 		arm_block_prot |= ARM_TTE_BLOCK_PNX;
1948 	}
1949 
1950 	for (vaddr_cur = vaddr;
1951 	    vaddr_cur < trunc_page_64(vaddr + size);
1952 	    vaddr_cur += PAGE_SIZE) {
1953 		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1954 		if (ppn != (vm_offset_t) NULL) {
1955 			tt_entry_t      *tte2;
1956 			pt_entry_t      *pte_p;
1957 			pt_entry_t      ptmp;
1958 
1959 #if XNU_MONITOR
1960 			assert(!pmap_is_monitor(ppn));
1961 			assert(!TEST_PAGE_RATIO_4);
1962 #endif
1963 
1964 			tte2 = arm_kva_to_tte(vaddr_cur);
1965 
1966 			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1967 				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
1968 				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
1969 					/*
1970 					 * We can support ml_static_protect on a block mapping if the mapping already has
1971 					 * the desired protections.  We still want to run checks on a per-page basis.
1972 					 */
1973 					continue;
1974 				}
1975 
1976 				result = KERN_FAILURE;
1977 				break;
1978 			}
1979 
1980 			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
1981 			ptmp = *pte_p;
1982 
1983 			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
1984 				/*
1985 				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
1986 				 * protections do not match the desired protections, then we will fail (as we cannot update
1987 				 * this mapping without updating other mappings as well).
1988 				 */
1989 				result = KERN_FAILURE;
1990 				break;
1991 			}
1992 
1993 			__unreachable_ok_push
1994 			if (TEST_PAGE_RATIO_4) {
1995 				{
1996 					unsigned int    i;
1997 					pt_entry_t      *ptep_iter;
1998 
1999 					ptep_iter = pte_p;
2000 					for (i = 0; i < 4; i++, ptep_iter++) {
2001 						/* Note that there is a hole in the HINT sanity checking here. */
2002 						ptmp = *ptep_iter;
2003 
2004 						/* We only need to update the page tables if the protections do not match. */
2005 						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
2006 							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
2007 							*ptep_iter = ptmp;
2008 						}
2009 					}
2010 				}
2011 			} else {
2012 				ptmp = *pte_p;
2013 				/* We only need to update the page tables if the protections do not match. */
2014 				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
2015 					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
2016 					*pte_p = ptmp;
2017 				}
2018 			}
2019 			__unreachable_ok_pop
2020 		}
2021 	}
2022 
2023 	if (vaddr_cur > vaddr) {
2024 		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
2025 		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
2026 	}
2027 
2028 
2029 	return result;
2030 #endif /* CONFIG_SPTM */
2031 }
2032 
2033 #if defined(CONFIG_SPTM)
2034 /*
2035  * Returns true if the given physical address is in one of the boot kernelcache ranges.
2036  */
2037 static bool
ml_physaddr_in_bootkc_range(vm_offset_t physaddr)2038 ml_physaddr_in_bootkc_range(vm_offset_t physaddr)
2039 {
2040 	for (int i = 0; i < arm_vm_kernelcache_numranges; i++) {
2041 		if (physaddr >= arm_vm_kernelcache_ranges[i].start_phys && physaddr < arm_vm_kernelcache_ranges[i].end_phys) {
2042 			return true;
2043 		}
2044 	}
2045 	return false;
2046 }
2047 #endif /* defined(CONFIG_SPTM) */
2048 
2049 /*
2050  * List of ml_static_mfree()'d pages that have been freed before
2051  * physical aperture sliding has taken place. If sliding has not
2052  * occurred yet, ml_static_mfree() will create pages, but not add them
2053  * to the free page queue yet. If it did, code that e.g. calls
2054  * pmap_page_alloc() could get a page back whose physical aperture
2055  * will later be slid, potentially leaving dangling pointers pointing
2056  * to the old kva of the page behind.
2057  *
2058  * Such errors are hard to avoid and hard to debug, so instead we
2059  * queue pages in this dedicated list, and release all accumulated
2060  * pages into the regular free queue all at once right after phys
2061  * aperture sliding took place in arm_vm_prot_finalize().
2062  */
2063 static
2064 vm_page_list_t ml_static_mfree_pre_slide_list;
2065 
2066 /*
2067  * Indicates whether we still need ml_static_mfree() to queue up pages
2068  * in ml_static_free_pre_slide_list. If not, ml_static_mfree()
2069  * directly releases newly created pages into the free queue instead.
2070  */
2071 static
2072 bool ml_static_mfree_queue_up = true;
2073 
2074 /*
2075  * Release all pages queued up by ml_static_mfree() to the free queue.
2076  * This should be called after physical aperture sliding has taken
2077  * place (i.e. in arm_vm_prot_finalize()), to indicate that the
2078  * physical aperture is now stable, and subsequently ml_static_mfree()
2079  * can directly release pages into the free queue instead.
2080  */
2081 static void
ml_release_deferred_pages(void)2082 ml_release_deferred_pages(void)
2083 {
2084 	vm_page_free_list(ml_static_mfree_pre_slide_list.vmpl_head, false);
2085 	ml_static_mfree_queue_up = false;
2086 }
2087 
2088 /*
2089  *	Routine:        ml_static_mfree
2090  *	Function:
2091  */
2092 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)2093 ml_static_mfree(
2094 	vm_offset_t vaddr,
2095 	vm_size_t   size)
2096 {
2097 	vm_offset_t vaddr_cur;
2098 	vm_offset_t paddr_cur;
2099 	ppnum_t     ppn;
2100 	uint32_t    freed_pages = 0;
2101 	uint32_t    freed_kernelcache_pages = 0;
2102 
2103 
2104 	/* It is acceptable (if bad) to fail to free. */
2105 	if (vaddr < physmap_base) {
2106 		return;
2107 	}
2108 
2109 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
2110 
2111 	for (vaddr_cur = vaddr;
2112 	    vaddr_cur < trunc_page_64(vaddr + size);
2113 	    vaddr_cur += PAGE_SIZE) {
2114 		/*
2115 		 * Some clients invoke ml_static_mfree on non-physical aperture
2116 		 * addresses.  To support this, we convert the virtual address
2117 		 * to a physical aperture address, and remove all mappings of
2118 		 * the page as we update the physical aperture protections.
2119 		 */
2120 		vm_offset_t vaddr_papt = phystokv(kvtophys(vaddr_cur));
2121 		ppn = pmap_find_phys(kernel_pmap, vaddr_papt);
2122 
2123 		if (ppn != (vm_offset_t) NULL) {
2124 			/*
2125 			 * It is not acceptable to fail to update the protections on a page
2126 			 * we will release to the VM.  We need to either panic or continue.
2127 			 * For now, we'll panic (to help flag if there is memory we can
2128 			 * reclaim).
2129 			 */
2130 			pmap_disconnect(ppn);
2131 			if (ml_static_protect(vaddr_papt, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
2132 				panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
2133 			}
2134 
2135 			paddr_cur = ptoa(ppn);
2136 
2137 
2138 			if (__probable(!ml_static_mfree_queue_up)) {
2139 				vm_page_create_canonical(ppn);
2140 			} else {
2141 				vm_page_t m = vm_page_create(ppn, true, Z_WAITOK);
2142 
2143 				vm_page_list_push(&ml_static_mfree_pre_slide_list, m);
2144 			}
2145 
2146 			freed_pages++;
2147 #if defined(CONFIG_SPTM)
2148 			if (ml_physaddr_in_bootkc_range(paddr_cur))
2149 #else
2150 			if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end)
2151 #endif
2152 			{
2153 				freed_kernelcache_pages++;
2154 			}
2155 		}
2156 	}
2157 
2158 	vm_page_lockspin_queues();
2159 	vm_page_wire_count -= freed_pages;
2160 	vm_page_wire_count_initial -= freed_pages;
2161 	vm_page_kernelcache_count -= freed_kernelcache_pages;
2162 	vm_page_unlock_queues();
2163 #if     DEBUG
2164 	kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
2165 #endif
2166 }
2167 
2168 /*
2169  * Routine: ml_page_protection_type
2170  * Function: Returns the type of page protection that the system supports.
2171  */
2172 ml_page_protection_t
ml_page_protection_type(void)2173 ml_page_protection_type(void)
2174 {
2175 #if CONFIG_SPTM
2176 	return 2;
2177 #elif XNU_MONITOR
2178 	return 1;
2179 #else
2180 	return 0;
2181 #endif
2182 }
2183 
2184 /* virtual to physical on wired pages */
2185 vm_offset_t
ml_vtophys(vm_offset_t vaddr)2186 ml_vtophys(vm_offset_t vaddr)
2187 {
2188 	return kvtophys(vaddr);
2189 }
2190 
2191 /*
2192  * Routine: ml_nofault_copy
2193  * Function: Perform a physical mode copy if the source and destination have
2194  * valid translations in the kernel pmap. If translations are present, they are
2195  * assumed to be wired; e.g., no attempt is made to guarantee that the
2196  * translations obtained remain valid for the duration of the copy process.
2197  */
2198 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)2199 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
2200 {
2201 	addr64_t        cur_phys_dst, cur_phys_src;
2202 	vm_size_t       count, nbytes = 0;
2203 
2204 	while (size > 0) {
2205 		if (!(cur_phys_src = kvtophys(virtsrc))) {
2206 			break;
2207 		}
2208 		if (!(cur_phys_dst = kvtophys(virtdst))) {
2209 			break;
2210 		}
2211 		if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
2212 		    !pmap_valid_address(trunc_page_64(cur_phys_src))) {
2213 			break;
2214 		}
2215 		count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
2216 		if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
2217 			count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
2218 		}
2219 		if (count > size) {
2220 			count = size;
2221 		}
2222 
2223 		bcopy_phys(cur_phys_src, cur_phys_dst, count);
2224 
2225 		nbytes += count;
2226 		virtsrc += count;
2227 		virtdst += count;
2228 		size -= count;
2229 	}
2230 
2231 	return nbytes;
2232 }
2233 
2234 /*
2235  *	Routine:        ml_validate_nofault
2236  *	Function: Validate that ths address range has a valid translations
2237  *			in the kernel pmap.  If translations are present, they are
2238  *			assumed to be wired; i.e. no attempt is made to guarantee
2239  *			that the translation persist after the check.
2240  *  Returns: TRUE if the range is mapped and will not cause a fault,
2241  *			FALSE otherwise.
2242  */
2243 
2244 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)2245 ml_validate_nofault(
2246 	vm_offset_t virtsrc, vm_size_t size)
2247 {
2248 	addr64_t cur_phys_src;
2249 	uint32_t count;
2250 
2251 	while (size > 0) {
2252 		if (!(cur_phys_src = kvtophys(virtsrc))) {
2253 			return FALSE;
2254 		}
2255 		if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
2256 			return FALSE;
2257 		}
2258 		count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
2259 		if (count > size) {
2260 			count = (uint32_t)size;
2261 		}
2262 
2263 		virtsrc += count;
2264 		size -= count;
2265 	}
2266 
2267 	return TRUE;
2268 }
2269 
2270 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)2271 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
2272 {
2273 	*phys_addr = 0;
2274 	*size = 0;
2275 }
2276 
2277 void
active_rt_threads(__unused boolean_t active)2278 active_rt_threads(__unused boolean_t active)
2279 {
2280 }
2281 
2282 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)2283 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
2284 {
2285 	return;
2286 }
2287 
2288 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
2289 
2290 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)2291 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
2292 {
2293 	if (cpu_qos_cb != NULL) {
2294 		cpu_qos_update = cpu_qos_cb;
2295 	} else {
2296 		cpu_qos_update = cpu_qos_cb_default;
2297 	}
2298 }
2299 
2300 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)2301 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
2302 {
2303 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
2304 
2305 	cpu_qos_update((int)urgency, rt_period, rt_deadline);
2306 
2307 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
2308 }
2309 
2310 void
machine_run_count(__unused uint32_t count)2311 machine_run_count(__unused uint32_t count)
2312 {
2313 }
2314 
2315 #if KASAN
2316 vm_offset_t ml_stack_base(void);
2317 vm_size_t ml_stack_size(void);
2318 
2319 vm_offset_t
ml_stack_base(void)2320 ml_stack_base(void)
2321 {
2322 	uintptr_t local = (uintptr_t) &local;
2323 	vm_offset_t     intstack_top_ptr;
2324 
2325 	intstack_top_ptr = getCpuDatap()->intstack_top;
2326 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2327 		return intstack_top_ptr - INTSTACK_SIZE;
2328 	} else {
2329 		return current_thread()->kernel_stack;
2330 	}
2331 }
2332 vm_size_t
ml_stack_size(void)2333 ml_stack_size(void)
2334 {
2335 	uintptr_t local = (uintptr_t) &local;
2336 	vm_offset_t     intstack_top_ptr;
2337 
2338 	intstack_top_ptr = getCpuDatap()->intstack_top;
2339 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2340 		return INTSTACK_SIZE;
2341 	} else {
2342 		return kernel_stack_size;
2343 	}
2344 }
2345 #endif
2346 
2347 #ifdef CONFIG_KCOV
2348 
2349 kcov_cpu_data_t *
current_kcov_data(void)2350 current_kcov_data(void)
2351 {
2352 	return &current_cpu_datap()->cpu_kcov_data;
2353 }
2354 
2355 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)2356 cpu_kcov_data(int cpuid)
2357 {
2358 	return &cpu_datap(cpuid)->cpu_kcov_data;
2359 }
2360 
2361 #endif /* CONFIG_KCOV */
2362 
2363 boolean_t
machine_timeout_suspended(void)2364 machine_timeout_suspended(void)
2365 {
2366 	return FALSE;
2367 }
2368 
2369 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)2370 ml_interrupt_prewarm(__unused uint64_t deadline)
2371 {
2372 	return KERN_FAILURE;
2373 }
2374 
/*
 * System register names used by the kernel timer routines in this file.
 * With HAS_APPLE_GENERIC_TIMER they are spelled as raw S3_* encodings (the
 * kernel timer APIs always use the Apple timebase); otherwise the
 * architectural register names are used.
 */
#if HAS_APPLE_GENERIC_TIMER
/* The kernel timer APIs always use the Apple timebase */
#define KERNEL_CNTV_TVAL_EL0 "S3_1_C15_C15_4"
#define KERNEL_CNTVCT_EL0    "S3_4_C15_C11_7"
#define KERNEL_CNTVCTSS_EL0  "S3_4_C15_C10_6"
#define KERNEL_CNTV_CTL_EL0  "S3_1_C15_C0_5"
#define KERNEL_CNTKCTL_EL1   "S3_4_C15_C9_6"
#else
#define KERNEL_CNTV_TVAL_EL0 "CNTV_TVAL_EL0"
#define KERNEL_CNTVCT_EL0    "CNTVCT_EL0"
#define KERNEL_CNTVCTSS_EL0  "CNTVCTSS_EL0"
#define KERNEL_CNTV_CTL_EL0  "CNTV_CTL_EL0"
#define KERNEL_CNTKCTL_EL1   "CNTKCTL_EL1"
#endif
2389 
2390 /*
2391  * Assumes fiq, irq disabled.
2392  */
2393 void
ml_set_decrementer(uint32_t dec_value)2394 ml_set_decrementer(uint32_t dec_value)
2395 {
2396 	cpu_data_t      *cdp = getCpuDatap();
2397 
2398 	assert(ml_get_interrupts_enabled() == FALSE);
2399 	cdp->cpu_decrementer = dec_value;
2400 
2401 	if (cdp->cpu_set_decrementer_func) {
2402 		cdp->cpu_set_decrementer_func(dec_value);
2403 	} else {
2404 		__builtin_arm_wsr64(KERNEL_CNTV_TVAL_EL0, (uint64_t)dec_value);
2405 	}
2406 }
2407 
2408 /**
2409  * Perform a read of the timebase which is permitted to be executed
2410  * speculatively and/or out of program order.
2411  */
2412 static inline uint64_t
speculative_timebase(void)2413 speculative_timebase(void)
2414 {
2415 	return __builtin_arm_rsr64(KERNEL_CNTVCT_EL0);
2416 }
2417 
2418 /**
2419  * Read a non-speculative view of the timebase if one is available,
2420  * otherwise fallback on an ISB to prevent prevent speculation and
2421  * enforce ordering.
2422  */
2423 static inline uint64_t
nonspeculative_timebase(void)2424 nonspeculative_timebase(void)
2425 {
2426 #if   __ARM_ARCH_8_6__
2427 	return __builtin_arm_rsr64(KERNEL_CNTVCTSS_EL0);
2428 #else
2429 	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
2430 	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
2431 	// to other instructions executed on the same processor."
2432 	__builtin_arm_isb(ISB_SY);
2433 	return speculative_timebase();
2434 #endif
2435 }
2436 
2437 
/*
 * Return the raw hardware timebase, read through nonspeculative_timebase().
 * The per-CPU cpu_base_timebase offset is NOT applied here.
 *
 * Declared with (void): an empty parameter list is not a prototype in C,
 * and the rest of this file uses (void) for zero-argument functions.
 */
uint64_t
ml_get_hwclock(void)
{
	return nonspeculative_timebase();
}
2444 
/*
 * Return the raw hardware timebase, read through speculative_timebase()
 * (no ordering barrier). The per-CPU cpu_base_timebase offset is NOT
 * applied here.
 *
 * Declared with (void): an empty parameter list is not a prototype in C,
 * and the rest of this file uses (void) for zero-argument functions.
 */
uint64_t
ml_get_hwclock_speculative(void)
{
	return speculative_timebase();
}
2451 
2452 uint64_t
ml_get_timebase()2453 ml_get_timebase()
2454 {
2455 	uint64_t clock, timebase;
2456 
2457 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
2458 	do {
2459 		timebase = getCpuDatap()->cpu_base_timebase;
2460 		os_compiler_barrier();
2461 		clock = ml_get_hwclock();
2462 		os_compiler_barrier();
2463 	} while (getCpuDatap()->cpu_base_timebase != timebase);
2464 
2465 	return clock + timebase;
2466 }
2467 
2468 /**
2469  * Issue a barrier that guarantees all prior memory accesses will complete
2470  * before any subsequent timebase reads.
2471  */
2472 void
ml_memory_to_timebase_fence(void)2473 ml_memory_to_timebase_fence(void)
2474 {
2475 	__builtin_arm_dmb(DMB_SY);
2476 	const uint64_t take_backwards_branch = 0;
2477 	asm volatile (
2478         "1:"
2479                 "ldr	x0, [%[take_backwards_branch]]" "\n"
2480                 "cbnz	x0, 1b"                         "\n"
2481                 :
2482                 : [take_backwards_branch] "r"(&take_backwards_branch)
2483                 : "x0"
2484         );
2485 
2486 	/* throwaway read to prevent ml_get_speculative_timebase() reordering */
2487 	(void)ml_get_hwclock();
2488 }
2489 
2490 /**
2491  * Issue a barrier that guarantees all prior timebase reads will
2492  * be ordered before any subsequent memory accesses.
2493  */
2494 void
ml_timebase_to_memory_fence(void)2495 ml_timebase_to_memory_fence(void)
2496 {
2497 	__builtin_arm_isb(ISB_SY);
2498 }
2499 
2500 /*
2501  * Get the speculative timebase without an ISB.
2502  */
2503 uint64_t
ml_get_speculative_timebase(void)2504 ml_get_speculative_timebase(void)
2505 {
2506 	uint64_t clock, timebase;
2507 
2508 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
2509 	do {
2510 		timebase = getCpuDatap()->cpu_base_timebase;
2511 		os_compiler_barrier();
2512 		clock = speculative_timebase();
2513 
2514 		os_compiler_barrier();
2515 	} while (getCpuDatap()->cpu_base_timebase != timebase);
2516 
2517 	return clock + timebase;
2518 }
2519 
/* Entropy source: simply the value of ml_get_speculative_timebase(). */
uint64_t
ml_get_timebase_entropy(void)
{
	const uint64_t sample = ml_get_speculative_timebase();

	return sample;
}
2525 
2526 uint32_t
ml_get_decrementer(void)2527 ml_get_decrementer(void)
2528 {
2529 	cpu_data_t *cdp = getCpuDatap();
2530 	uint32_t dec;
2531 
2532 	assert(ml_get_interrupts_enabled() == FALSE);
2533 
2534 	if (cdp->cpu_get_decrementer_func) {
2535 		dec = cdp->cpu_get_decrementer_func();
2536 	} else {
2537 		uint64_t wide_val;
2538 
2539 		wide_val = __builtin_arm_rsr64(KERNEL_CNTV_TVAL_EL0);
2540 		dec = (uint32_t)wide_val;
2541 		assert(wide_val == (uint64_t)dec);
2542 	}
2543 
2544 	return dec;
2545 }
2546 
2547 boolean_t
ml_get_timer_pending(void)2548 ml_get_timer_pending(void)
2549 {
2550 	uint64_t cntv_ctl = __builtin_arm_rsr64(KERNEL_CNTV_CTL_EL0);
2551 	return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2552 }
2553 
2554 __attribute__((noreturn))
2555 void
platform_syscall(arm_saved_state_t * state)2556 platform_syscall(arm_saved_state_t *state)
2557 {
2558 	uint32_t code;
2559 
2560 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2561 
2562 	code = (uint32_t)get_saved_state_reg(state, 3);
2563 
2564 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2565 	    get_saved_state_reg(state, 0),
2566 	    get_saved_state_reg(state, 1),
2567 	    get_saved_state_reg(state, 2));
2568 
2569 	switch (code) {
2570 	case 2:
2571 		/* set cthread */
2572 		platform_syscall_kprintf("set cthread self.\n");
2573 		thread_set_cthread_self(get_saved_state_reg(state, 0));
2574 		break;
2575 	case 3:
2576 		/* get cthread */
2577 		platform_syscall_kprintf("get cthread self.\n");
2578 		set_user_saved_state_reg(state, 0, thread_get_cthread_self());
2579 		break;
2580 	case 0: /* I-Cache flush (removed) */
2581 	case 1: /* D-Cache flush (removed) */
2582 	default:
2583 		platform_syscall_kprintf("unknown: %d\n", code);
2584 		break;
2585 	}
2586 
2587 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2588 	    get_saved_state_reg(state, 0));
2589 
2590 	thread_exception_return();
2591 }
2592 
2593 static void
_enable_timebase_event_stream(uint32_t bit_index)2594 _enable_timebase_event_stream(uint32_t bit_index)
2595 {
2596 	if (bit_index >= 64) {
2597 		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2598 	}
2599 
2600 	uint64_t cntkctl = __builtin_arm_rsr64(KERNEL_CNTKCTL_EL1);
2601 
2602 	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2603 	cntkctl |= CNTKCTL_EL1_EVNTEN;
2604 	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2605 
2606 	/*
2607 	 * If the SOC supports it (and it isn't broken), enable
2608 	 * EL0 access to the timebase registers.
2609 	 */
2610 	if (user_timebase_type() != USER_TIMEBASE_NONE) {
2611 		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2612 	}
2613 
2614 	__builtin_arm_wsr64(KERNEL_CNTKCTL_EL1, cntkctl);
2615 
2616 #if HAS_APPLE_GENERIC_TIMER
2617 	/* Enable EL0 access to the ARM timebase registers too */
2618 	uint64_t arm_cntkctl = __builtin_arm_rsr64("CNTKCTL_EL1");
2619 	arm_cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2620 	__builtin_arm_wsr64("CNTKCTL_EL1", arm_cntkctl);
2621 #endif
2622 }
2623 
2624 /*
2625  * Turn timer on, unmask that interrupt.
2626  */
2627 static void
_enable_virtual_timer(void)2628 _enable_virtual_timer(void)
2629 {
2630 	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2631 
2632 	__builtin_arm_wsr64(KERNEL_CNTV_CTL_EL0, cntvctl);
2633 	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2634 	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2635 #if HAS_APPLE_GENERIC_TIMER
2636 	__builtin_arm_wsr64("S3_1_C15_C13_4", CNTP_CTL_EL0_IMASKED);
2637 #endif
2638 }
2639 
2640 void
fiq_context_init(boolean_t enable_fiq __unused)2641 fiq_context_init(boolean_t enable_fiq __unused)
2642 {
2643 	/* Interrupts still disabled. */
2644 	assert(ml_get_interrupts_enabled() == FALSE);
2645 	_enable_virtual_timer();
2646 }
2647 
2648 void
wfe_timeout_init(void)2649 wfe_timeout_init(void)
2650 {
2651 	_enable_timebase_event_stream(arm64_eventi);
2652 }
2653 
2654 /**
2655  * Configures, but does not enable, the WFE event stream. The event stream
2656  * generates an event at a set interval to act as a timeout for WFEs.
2657  *
2658  * This function sets the static global variable arm64_eventi to be the proper
2659  * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2660  * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2661  * is used by wfe_timeout_init to actually poke the registers and enable the
2662  * event stream.
2663  *
2664  * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2665  * is the trigger for the system to generate an event. The trigger can occur on
2666  * either the rising or falling edge of the bit depending on the value of
2667  * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2668  * falling edge (1->0) transition to generate events.
2669  */
2670 void
wfe_timeout_configure(void)2671 wfe_timeout_configure(void)
2672 {
2673 	/* Could fill in our own ops here, if we needed them */
2674 	uint64_t        ticks_per_sec, ticks_per_event, events_per_sec = 0;
2675 	uint32_t        bit_index;
2676 
2677 	if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2678 		if (events_per_sec <= 0) {
2679 			events_per_sec = 1;
2680 		} else if (events_per_sec > USEC_PER_SEC) {
2681 			events_per_sec = USEC_PER_SEC;
2682 		}
2683 	} else {
2684 		events_per_sec = USEC_PER_SEC;
2685 	}
2686 	ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2687 	ticks_per_event = ticks_per_sec / events_per_sec;
2688 
2689 	/* Bit index of next power of two greater than ticks_per_event */
2690 	bit_index = flsll(ticks_per_event) - 1;
2691 	/* Round up to next power of two if ticks_per_event is initially power of two */
2692 	if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2693 		bit_index++;
2694 	}
2695 
2696 	/*
2697 	 * The timer can only trigger on rising or falling edge, not both; we don't
2698 	 * care which we trigger on, but we do need to adjust which bit we are
2699 	 * interested in to account for this.
2700 	 *
2701 	 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2702 	 * falling edge of the given bit. Therefore, we must decrement the bit index
2703 	 * by one as when the bit before the one we care about makes a 1 -> 0
2704 	 * transition, the bit we care about makes a 0 -> 1 transition.
2705 	 *
2706 	 * For example if we want an event generated every 8 ticks (if we calculated
2707 	 * a bit_index of 3), we would want the event to be generated whenever the
2708 	 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2709 	 * see that the bit at index 2 makes a falling transition in this scenario,
2710 	 * so we would want EVENTI to be 2 instead of 3.
2711 	 */
2712 	if (bit_index != 0) {
2713 		bit_index--;
2714 	}
2715 
2716 	arm64_eventi = bit_index;
2717 }
2718 
2719 boolean_t
ml_delay_should_spin(uint64_t interval)2720 ml_delay_should_spin(uint64_t interval)
2721 {
2722 	cpu_data_t     *cdp = getCpuDatap();
2723 
2724 	if (cdp->cpu_idle_latency) {
2725 		return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2726 	} else {
2727 		/*
2728 		 * Early boot, latency is unknown. Err on the side of blocking,
2729 		 * which should always be safe, even if slow
2730 		 */
2731 		return FALSE;
2732 	}
2733 }
2734 
2735 boolean_t
ml_thread_is64bit(thread_t thread)2736 ml_thread_is64bit(thread_t thread)
2737 {
2738 	return thread_is_64bit_addr(thread);
2739 }
2740 
/*
 * On DEVELOPMENT or DEBUG builds, delay for yield_delay_us microseconds
 * when that value is non-zero. A no-op on other builds.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us != 0) {
		delay(yield_delay_us);
	}
#endif
}
2750 
/* Intentionally empty. */
void
ml_timer_evaluate(void)
{
}
2755 
2756 boolean_t
ml_timer_forced_evaluation(void)2757 ml_timer_forced_evaluation(void)
2758 {
2759 	return FALSE;
2760 }
2761 
2762 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2763 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2764 {
2765 	/*
2766 	 * For now: update the resource coalition stats of the
2767 	 * current thread's coalition
2768 	 */
2769 	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2770 }
2771 
2772 uint64_t
ml_gpu_stat(__unused thread_t t)2773 ml_gpu_stat(__unused thread_t t)
2774 {
2775 	return 0;
2776 }
2777 
2778 thread_t
current_thread(void)2779 current_thread(void)
2780 {
2781 	return current_thread_fast();
2782 }
2783 
2784 #if defined(HAS_APPLE_PAC)
2785 uint8_t
ml_task_get_disable_user_jop(task_t task)2786 ml_task_get_disable_user_jop(task_t task)
2787 {
2788 	assert(task);
2789 	return task->disable_user_jop;
2790 }
2791 
2792 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2793 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2794 {
2795 	assert(task);
2796 	task->disable_user_jop = disable_user_jop;
2797 }
2798 
2799 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2800 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2801 {
2802 	assert(thread);
2803 	if (disable_user_jop) {
2804 		thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2805 	} else {
2806 		thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2807 	}
2808 }
2809 
2810 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2811 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2812 {
2813 	if (inherit) {
2814 		task->rop_pid = parent_task->rop_pid;
2815 	} else {
2816 		task->rop_pid = early_random();
2817 	}
2818 }
2819 
2820 /**
2821  * jop_pid may be inherited from the parent task or generated inside the shared
2822  * region.  Unfortunately these two parameters are available at very different
2823  * times during task creation, so we need to split this into two steps.
2824  */
2825 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit,boolean_t disable_user_jop)2826 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit, boolean_t disable_user_jop)
2827 {
2828 	if (inherit) {
2829 		task->jop_pid = parent_task->jop_pid;
2830 	} else if (disable_user_jop) {
2831 		task->jop_pid = ml_non_arm64e_user_jop_pid();
2832 	} else {
2833 		task->jop_pid = ml_default_jop_pid();
2834 	}
2835 }
2836 
2837 void
ml_task_set_jop_pid_from_shared_region(task_t task,boolean_t disable_user_jop)2838 ml_task_set_jop_pid_from_shared_region(task_t task, boolean_t disable_user_jop)
2839 {
2840 	if (disable_user_jop) {
2841 		task->jop_pid = ml_non_arm64e_user_jop_pid();
2842 		return;
2843 	}
2844 
2845 	vm_shared_region_t sr = vm_shared_region_get(task);
2846 	/*
2847 	 * If there's no shared region, we can assign the key arbitrarily.  This
2848 	 * typically happens when Mach-O image activation failed part of the way
2849 	 * through, and this task is in the middle of dying with SIGKILL anyway.
2850 	 */
2851 	if (__improbable(!sr)) {
2852 		task->jop_pid = early_random();
2853 		return;
2854 	}
2855 	vm_shared_region_deallocate(sr);
2856 
2857 	/*
2858 	 * Similarly we have to worry about jetsam having killed the task and
2859 	 * already cleared the shared_region_id.
2860 	 */
2861 	task_lock(task);
2862 	if (task->shared_region_id != NULL) {
2863 		task->jop_pid = shared_region_find_key(task->shared_region_id);
2864 	} else {
2865 		task->jop_pid = early_random();
2866 	}
2867 	task_unlock(task);
2868 }
2869 
2870 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2871 ml_thread_set_jop_pid(thread_t thread, task_t task)
2872 {
2873 	thread->machine.jop_pid = task->jop_pid;
2874 }
2875 #endif /* defined(HAS_APPLE_PAC) */
2876 
2877 #if DEVELOPMENT || DEBUG
2878 static uint64_t minor_badness_suffered = 0;
2879 #endif
2880 void
ml_report_minor_badness(uint32_t __unused badness_id)2881 ml_report_minor_badness(uint32_t __unused badness_id)
2882 {
2883 	#if DEVELOPMENT || DEBUG
2884 	(void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
2885 	#endif
2886 }
2887 
2888 #if HAS_APPLE_PAC
2889 /**
2890  * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2891  */
2892 void *
ml_poison_ptr(void * ptr,ptrauth_key key)2893 ml_poison_ptr(void *ptr, ptrauth_key key)
2894 {
2895 	bool b_key = key & (1ULL << 0);
2896 	uint64_t error_code;
2897 	if (b_key) {
2898 		error_code = 2;
2899 	} else {
2900 		error_code = 1;
2901 	}
2902 
2903 	bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2904 	bool data_key = key & (1ULL << 1);
2905 	/* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2906 	bool tbi = data_key && !kernel_pointer;
2907 	unsigned int poison_shift;
2908 	if (tbi) {
2909 		poison_shift = 53;
2910 	} else {
2911 		poison_shift = 61;
2912 	}
2913 
2914 	uintptr_t poisoned = (uintptr_t)ptr;
2915 	poisoned &= ~(3ULL << poison_shift);
2916 	poisoned |= error_code << poison_shift;
2917 	return (void *)poisoned;
2918 }
2919 #endif /* HAS_APPLE_PAC */
2920 
2921 #ifdef CONFIG_XNUPOST
2922 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2923 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2924 {
2925 	thread_t thread = current_thread();
2926 	thread->machine.expected_fault_handler = expected_fault_handler;
2927 	thread->machine.expected_fault_addr = expected_fault_addr;
2928 	thread->machine.expected_fault_pc = 0;
2929 }
2930 
2931 /** Expect an exception to be thrown at EXPECTED_FAULT_PC */
2932 void
ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_pc)2933 ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_pc)
2934 {
2935 	thread_t thread = current_thread();
2936 	thread->machine.expected_fault_handler = expected_fault_handler;
2937 	thread->machine.expected_fault_addr = 0;
2938 	uintptr_t raw_func = (uintptr_t)ptrauth_strip(
2939 		(void *)expected_fault_pc,
2940 		ptrauth_key_function_pointer);
2941 	thread->machine.expected_fault_pc = raw_func;
2942 }
2943 
2944 void
ml_expect_fault_end(void)2945 ml_expect_fault_end(void)
2946 {
2947 	thread_t thread = current_thread();
2948 	thread->machine.expected_fault_handler = NULL;
2949 	thread->machine.expected_fault_addr = 0;
2950 	thread->machine.expected_fault_pc = 0;
2951 }
2952 #endif /* CONFIG_XNUPOST */
2953 
/**
 * First half of the hibernation wake hooks.
 *
 * Does nothing unless the kernel is built with HIBERNATION and
 * gIOHibernateState is kIOHibernateStateWakingFromHibernate.  In that case
 * it rebuilds the VM structures and, on SPTM configurations, marks the pmap
 * hibernation state as restoring.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_rebuild_vm_structs();

#if CONFIG_SPTM
	/* Tell the pmap that hibernation restoration has started. */
	extern secure_hmac_hib_state_t pmap_hibernation_state;
	pmap_hibernation_state = SECURE_HMAC_HIB_RESTORE;
#endif /* CONFIG_SPTM */
#endif /* HIBERNATION */
}
2970 
/**
 * Second half of the hibernation wake hooks.
 *
 * Does nothing unless the kernel is built with HIBERNATION and
 * gIOHibernateState is kIOHibernateStateWakingFromHibernate.  In that case
 * it runs hibernate_machine_init(), calls hibernate_vm_lock_end() and clears
 * the current CPU's cpu_hibernate flag.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2982 
2983 /**
2984  * Return back a machine-dependent array of address space regions that should be
2985  * reserved by the VM (pre-mapped in the address space). This will prevent user
2986  * processes from allocating or deallocating from within these regions.
2987  *
2988  * @param vm_is64bit True if the process has a 64-bit address space.
2989  * @param regions An out parameter representing an array of regions to reserve.
2990  *
2991  * @return The number of reserved regions returned through `regions`.
2992  */
2993 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,const struct vm_reserved_region ** regions)2994 ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
2995 {
2996 	assert(regions != NULL);
2997 
2998 	/**
2999 	 * Reserved regions only apply to 64-bit address spaces. This is because
3000 	 * we only expect to grow the maximum user VA address on 64-bit address spaces
3001 	 * (we've essentially already reached the max for 32-bit spaces). The reserved
3002 	 * regions should safely fall outside of the max user VA for 32-bit processes.
3003 	 */
3004 	if (vm_is64bit) {
3005 		*regions = vm_reserved_regions;
3006 		return ARRAY_COUNT(vm_reserved_regions);
3007 	} else {
3008 		/* Don't reserve any VA regions on arm64_32 processes. */
3009 		*regions = NULL;
3010 		return 0;
3011 	}
3012 }
3013 
/*
 * Per-cluster WFE timeout recommendations, indexed by cluster ID.
 *
 * These recommendations are expected to be updated on a relatively
 * infrequent cadence, possibly from a different cluster, so false cacheline
 * sharing between entries isn't expected to be material.
 */
static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
3019 
3020 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)3021 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
3022 {
3023 	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
3024 	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
3025 	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
3026 	return 0; /* Success */
3027 }
3028 
#if DEVELOPMENT || DEBUG
/*
 * Debug overrides consulted by ml_cluster_wfe_timeout(). A zero value
 * leaves the corresponding override disabled.
 */
/* Non-zero: raise the result to the largest recommendation of any cluster. */
int wfe_rec_max = 0;
/* Non-zero: force the result to 0. */
int wfe_rec_none = 0;
/* Non-zero: use this value as the result; applied last, so it wins. */
uint64_t wfe_rec_override_mat = 0;
/* Non-zero: upper bound applied to the queried cluster's recommendation. */
uint64_t wfe_rec_clamp = 0;
#endif
3035 
3036 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)3037 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
3038 {
3039 	/* This and its consumer does not synchronize vis-a-vis updates
3040 	 * of the recommendation; races are acceptable.
3041 	 */
3042 	uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
3043 #if DEVELOPMENT || DEBUG
3044 	if (wfe_rec_clamp) {
3045 		wfet = MIN(wfe_rec_clamp, wfet);
3046 	}
3047 
3048 	if (wfe_rec_max) {
3049 		for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
3050 			if (arm64_cluster_wfe_recs[i] > wfet) {
3051 				wfet = arm64_cluster_wfe_recs[i];
3052 			}
3053 		}
3054 	}
3055 
3056 	if (wfe_rec_none) {
3057 		wfet = 0;
3058 	}
3059 
3060 	if (wfe_rec_override_mat) {
3061 		wfet = wfe_rec_override_mat;
3062 	}
3063 #endif
3064 	return wfet;
3065 }
3066 
3067 __pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)3068 ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
3069 {
3070 #if CONFIG_SPTM
3071 	/**
3072 	 * If the address is within one of the SPTM-allocated per-cpu stacks, then
3073 	 * return true.
3074 	 */
3075 	if ((addr >= SPTMArgs->cpu_stack_papt_start) &&
3076 	    (addr < SPTMArgs->cpu_stack_papt_end)) {
3077 		return true;
3078 	}
3079 
3080 	/**
3081 	 * If the address is within one of the TXM thread stacks, then return true.
3082 	 * The SPTM guarantees that these stacks are virtually contiguous.
3083 	 */
3084 	if ((addr >= SPTMArgs->txm_thread_stacks[0]) &&
3085 	    (addr < SPTMArgs->txm_thread_stacks[MAX_CPUS - 1])) {
3086 		return true;
3087 	}
3088 
3089 	return false;
3090 #elif XNU_MONITOR
3091 	return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
3092 #else
3093 	return false;
3094 #endif /* CONFIG_SPTM || XNU_MONITOR */
3095 }
3096 
3097 uint64_t
ml_get_backtrace_pc(struct arm_saved_state * state)3098 ml_get_backtrace_pc(struct arm_saved_state *state)
3099 {
3100 	assert((state != NULL) && is_saved_state64(state));
3101 
3102 #if CONFIG_SPTM
3103 	/**
3104 	 * On SPTM-based systems, when a non-XNU domain (e.g., SPTM) is interrupted,
3105 	 * the PC value saved into the state is not the actual PC at the interrupted
3106 	 * point, but a fixed value to a handler that knows how to re-enter the
3107 	 * interrupted domain. The interrupted domain's actual PC value is saved
3108 	 * into x14, so let's return that instead.
3109 	 */
3110 	if (ml_addr_in_non_xnu_stack(get_saved_state_fp(state))) {
3111 		return saved_state64(state)->x[14];
3112 	}
3113 #endif /* CONFIG_SPTM */
3114 
3115 	return get_saved_state_pc(state);
3116 }
3117 
3118 
3119 /**
3120  * Panic because an ARM saved-state accessor expected user saved-state but was
3121  * passed non-user saved-state.
3122  *
3123  * @param ss invalid saved-state (CPSR.M != EL0)
3124  */
3125 void
ml_panic_on_invalid_old_cpsr(const arm_saved_state_t * ss)3126 ml_panic_on_invalid_old_cpsr(const arm_saved_state_t *ss)
3127 {
3128 	panic("invalid CPSR in user saved-state %p", ss);
3129 }
3130 
3131 /**
3132  * Panic because an ARM saved-state accessor was passed user saved-state and
3133  * asked to assign a non-user CPSR.
3134  *
3135  * @param ss original EL0 saved-state
3136  * @param cpsr invalid new CPSR value (CPSR.M != EL0)
3137  */
3138 void
ml_panic_on_invalid_new_cpsr(const arm_saved_state_t * ss,uint32_t cpsr)3139 ml_panic_on_invalid_new_cpsr(const arm_saved_state_t *ss, uint32_t cpsr)
3140 {
3141 	panic("attempt to set non-user CPSR %#010x on user saved-state %p", cpsr, ss);
3142 }
3143 
3144 
/**
 * Explicit request to preallocate a floating point save area.
 * ARM does not need preallocation at this time, so this does nothing.
 */
void
ml_fp_save_area_prealloc(void)
{
	/* Intentionally empty. */
}
3153 
3154 
3155 void
ml_task_post_signature_processing_hook(__unused task_t task)3156 ml_task_post_signature_processing_hook(__unused task_t task)
3157 {
3158 	/**
3159 	 * Have an acquire barrier here to make sure the machine flags read that is going
3160 	 * to happen below is not speculated before the task->t_returnwaitflags earlier
3161 	 * in task_wait_to_return().
3162 	 */
3163 	os_atomic_thread_fence(acquire);
3164 
3165 }
3166 
3167 
3168 #if DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT
/* Set to true once ml_unsafe_kernel_text_init() has read a valid device-tree value. */
static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text_initialized) = false;
/* True when the "kernel-ctrr-to-be-enabled" property under /chosen was read as 0. */
static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text) = false;

/**
 * Report the cached _unsafe_kernel_text value.
 *
 * Asserts that the value has been initialized by
 * ml_unsafe_kernel_text_init() before it is read.
 *
 * @return the cached value.
 */
__mockable bool
ml_unsafe_kernel_text(void)
{
	assert(_unsafe_kernel_text_initialized);
	return _unsafe_kernel_text;
}
3178 
3179 __startup_func
3180 static void
ml_unsafe_kernel_text_init(void)3181 ml_unsafe_kernel_text_init(void)
3182 {
3183 	/* Grab the values written by iBoot. */
3184 
3185 	DTEntry         entry;
3186 	const void      *value;
3187 	unsigned int    size;
3188 	if (SecureDTLookupEntry(0, "/chosen", &entry) == kSuccess &&
3189 	    SecureDTGetProperty(entry, "kernel-ctrr-to-be-enabled", &value, &size) == kSuccess &&
3190 	    size == sizeof(int)) {
3191 		_unsafe_kernel_text_initialized = true;
3192 		_unsafe_kernel_text = (0 == *(const int *)value);
3193 	}
3194 }
3195 STARTUP(TUNABLES, STARTUP_RANK_FIRST, ml_unsafe_kernel_text_init);
3196 
3197 #else /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */
/**
 * Report whether kernel text is unsafe.
 *
 * @return always false: kernel text is never writable under these configs.
 */
bool
ml_unsafe_kernel_text(void)
{
	const bool kernel_text_writable = false;

	return kernel_text_writable;
}
3204 #endif /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */
3205