xref: /xnu-11417.101.15/osfmk/arm64/machine_routines.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_shared_region_xnu.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern_xnu.h>
58 #include <sys/codesign.h>
59 #include <sys/kdebug.h>
60 #include <kern/coalition.h>
61 #include <pexpert/device_tree.h>
62 #include <pexpert/arm64/board_config.h>
63 
64 #include <IOKit/IOPlatformExpert.h>
65 #if HIBERNATION
66 #include <IOKit/IOHibernatePrivate.h>
67 #endif /* HIBERNATION */
68 
69 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
70 #include <arm64/amcc_rorgn.h>
71 #endif
72 
73 
74 #if CONFIG_SPTM
75 #include <arm64/sptm/sptm.h>
76 #endif /* CONFIG_SPTM */
77 
78 #include <libkern/section_keywords.h>
79 
/**
 * On supported hardware, debuggable builds make the HID bits read-only
 * without locking them.  This lets people manually modify HID bits while
 * debugging, since they can use a debugging tool to first reset the HID
 * bits back to read/write.  However it will still catch xnu changes that
 * accidentally write to HID bits after they've been made read-only.
 */
SECURITY_READ_ONLY_LATE(bool) skip_spr_lockdown_glb = 0;

/*
 * On some SoCs, PIO lockdown is applied in assembly in early boot by
 * secondary CPUs.
 * Since the cluster_pio_ro_ctl value is dynamic, it is stored here by the
 * primary CPU so that it doesn't have to be computed each time by the
 * startup code.
 */
SECURITY_READ_ONLY_LATE(uint64_t) cluster_pio_ro_ctl_mask_glb = 0;
97 
98 #if CONFIG_CPU_COUNTERS
99 #include <kern/kpc.h>
100 #endif /* CONFIG_CPU_COUNTERS */
101 
/* Extract the Aff0 (CPU-within-cluster) and Aff1 (cluster) affinity fields
 * from an MPIDR_EL1 value. */
#define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
#define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)

#if HAS_CLUSTER
static uint8_t cluster_initialized = 0;
#endif

MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout

MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);

TUNABLE_DEV_WRITEABLE(uint64_t, MutexSpin, "mutex-spin", 240 /* 10us */);

/* Adaptive-mutex spin window bounds; both computed in ml_init_lock_timeout. */
uint64_t low_MutexSpin;
int64_t high_MutexSpin;



/* Absolute-time cap on WFE hint intervals; set from the "max_wfe_us"
 * boot-arg (default MAX_WFE_HINT_INTERVAL_US) in ml_init_lock_timeout. */
static uint64_t ml_wfe_hint_max_interval;
#define MAX_WFE_HINT_INTERVAL_US (500ULL)

/* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
126 
/* Kernel segment addresses/sizes defined elsewhere in the platform layer. */
extern vm_offset_t   segLOWEST;
extern vm_offset_t   segLOWESTTEXT;
extern vm_offset_t   segLASTB;
extern unsigned long segSizeLAST;

/* ARM64 specific bounds; used to test for presence in the kernelcache. */
extern vm_offset_t   vm_kernelcache_base;
extern vm_offset_t   vm_kernelcache_top;

/* Location of the physmap / physical aperture */
extern uint64_t physmap_base;

#if defined(CONFIG_SPTM)
/* With SPTM, the kernelcache may occupy multiple discontiguous physical ranges. */
extern const arm_physrange_t *arm_vm_kernelcache_ranges;
extern int arm_vm_kernelcache_numranges;
#else /* defined(CONFIG_SPTM) */
/* Without SPTM, the kernelcache occupies a single physical range. */
extern vm_offset_t arm_vm_kernelcache_phys_start;
extern vm_offset_t arm_vm_kernelcache_phys_end;
#endif /* defined(CONFIG_SPTM) */

#if defined(HAS_IPI)
/* Nonzero to use ACC fast IPIs; overridable via the "fastipi" boot-arg on
 * DEVELOPMENT/DEBUG builds (see machine_startup). */
unsigned int gFastIPI = 1;
#define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
    kDeferredIPITimerDefault);
#endif /* defined(HAS_IPI) */
152 #endif /* defined(HAS_IPI) */
153 
154 thread_t Idle_context(void);
155 
156 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
157 
158 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
159 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
160 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
161 	.version = CPU_TOPOLOGY_VERSION,
162 	.cpus = topology_cpu_array,
163 	.clusters = topology_cluster_array,
164 };
165 
166 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
167 
168 /**
169  * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
170  * entries of an arbitrary data type.  This is intended for use by specialized consumers
171  * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
172  * as follows:
173  *	hypothetical_array[cluster_offsets[AFF1] + AFF0]
174  * Most consumers should instead use general-purpose facilities such as PERCPU or
175  * ml_get_cpu_number().
176  */
177 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
178 
179 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
180 
/* Set to 1 at the end of machine_lockdown(). */
extern uint32_t lockdown_done;

/**
 * Represents regions of virtual address space that should be reserved
 * (pre-mapped) in each user address space.
 */
static const struct vm_reserved_region vm_reserved_regions[] = {
	{
		.vmrr_name = "GPU Carveout",
		.vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
		.vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
	},
	/*
	 * Reserve the virtual memory space representing the commpage nesting region
	 * to prevent user processes from allocating memory within it. The actual
	 * page table entries for the commpage are inserted by vm_commpage_enter().
	 * This vm_map_enter() just prevents userspace from allocating/deallocating
	 * anything within the entire commpage nested region.
	 */
	{
		.vmrr_name = "commpage nesting",
		.vmrr_addr = _COMM_PAGE64_NESTING_START,
		.vmrr_size = _COMM_PAGE64_NESTING_SIZE
	}
};

uint32_t get_arm_cpu_version(void);
208 
209 
#if defined(HAS_IPI)
/*
 * Issue a fast IPI of the given type to the CPU identified by its
 * MPIDR_EL1 value, via the Apple implementation-defined IPI request
 * registers (S3_5_C15_C0_0 local / S3_5_C15_C0_1 global).
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
#if HAS_CLUSTER
	uint64_t local_mpidr;
	/* NOTE: this logic expects that we are called in a non-preemptible
	 * context, or at least one in which the calling thread is bound
	 * to a single CPU.  Otherwise we may migrate between choosing which
	 * IPI mechanism to use and issuing the IPI. */
	MRS(local_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Target shares our cluster: local IPI register, CPU ID only. */
		uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", x);
	} else {
		#define IPI_RR_TARGET_CLUSTER_SHIFT 16
		/* Cross-cluster: global IPI register with the target cluster ID encoded. */
		uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", x);
	}
#else
	uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", x);
#endif
	/* The recommended local/global IPI sequence is:
	 *   DSB <sys> (This ensures visibility of e.g. older stores to the
	 *     pending CPU signals bit vector in DRAM prior to IPI reception,
	 *     and is present in cpu_signal_internal())
	 *   MSR S3_5_C15_C0_1, Xt
	 *   ISB
	 */
	__builtin_arm_isb(ISB_SY);
}
#endif
243 
#if !defined(HAS_IPI)
__dead2
#endif
/* Send an immediate fast IPI to the CPU with the given MPIDR_EL1 value.
 * Panics on platforms without ACC Fast IPI support. */
void
ml_cpu_signal(unsigned int cpu_mpidr __unused)
{
#if defined(HAS_IPI)
	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
#else
	panic("Platform does not support ACC Fast IPI");
#endif
}
256 
#if !defined(HAS_IPI)
__dead2
#endif
/* Program the deferred-IPI countdown timer from a nanosecond value.
 * Panics on platforms without ACC Fast IPI support. */
void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
{
#if defined(HAS_IPI)
	/* adjust IPI_CR timer countdown value for deferred IPI
	 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
	 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
	 *
	 * global register, should only require a single write to update all
	 * CPU cores: from Skye ACC user spec section 5.7.3.3
	 *
	 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
	 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
	 */
	uint64_t abstime;

	nanoseconds_to_absolutetime(nanosecs, &abstime);

	/* Clamp to the 16-bit countdown field. */
	abstime = MIN(abstime, 0xFFFF);

	/* update deferred_ipi_timer_ns with the new clamped value */
	absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);

	MSR("S3_5_C15_C3_1", abstime);
#else
	(void)nanosecs;
	panic("Platform does not support ACC Fast IPI");
#endif
}
289 
290 uint64_t
ml_cpu_signal_deferred_get_timer()291 ml_cpu_signal_deferred_get_timer()
292 {
293 #if defined(HAS_IPI)
294 	return deferred_ipi_timer_ns;
295 #else
296 	return 0;
297 #endif
298 }
299 
#if !defined(HAS_IPI)
__dead2
#endif
/* Send a deferred fast IPI (delivered after the IPI_CR countdown) to the
 * CPU with the given MPIDR_EL1 value.  Panics without ACC Fast IPI support. */
void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
{
#if defined(HAS_IPI)
	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
#else
	panic("Platform does not support ACC Fast IPI deferral");
#endif
}
312 
#if !defined(HAS_IPI)
__dead2
#endif
/* Retract a previously issued deferred fast IPI for the CPU with the given
 * MPIDR_EL1 value.  Panics without ACC Fast IPI support. */
void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
{
#if defined(HAS_IPI)
	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
#else
	panic("Platform does not support ACC Fast IPI retraction");
#endif
}
325 
extern uint32_t idle_proximate_io_wfe_unmasked;

/* kdebug code for tracing the WFE-based idle window below. */
#define CPUPM_IDLE_WFE 0x5310300

/*
 * If the perf. controller has an active WFE recommendation for this CPU's
 * cluster, idle in WFE (IRQs unmasked) until an interrupt arrives or the
 * recommendation window expires.
 *
 * Returns true if an interrupt arrived while polling, letting the caller
 * skip the heavier WFI-based idle path.
 */
static bool
wfe_process_recommendation(void)
{
	bool ipending = false;
	if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
		/* Check for an active perf. controller generated
		 * WFE recommendation for this cluster.
		 */
		cpu_data_t *cdp = getCpuDatap();
		uint32_t cid = cdp->cpu_cluster_id;
		uint64_t wfe_ttd = 0;
		uint64_t wfe_deadline = 0;

		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
			wfe_deadline = mach_absolute_time() + wfe_ttd;
		}

		if (wfe_deadline != 0) {
			/* Poll issuing event-bounded WFEs until an interrupt
			 * arrives or the WFE recommendation expires
			 */
#if DEVELOPMENT || DEBUG
			uint64_t wc = cdp->wfe_count;
			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
#endif
			/* Issue WFE until the recommendation expires,
			 * with IRQs unmasked.
			 */
			ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true, true);
#if DEVELOPMENT || DEBUG
			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
#endif
		}
	}
	return ipending;
}
365 
/*
 * Machine-dependent idle entry point: try a WFE-based idle first, then
 * fall back to the WFI-based Idle_context() path; re-enables IRQ+FIQ on
 * the way out.
 */
void
machine_idle(void)
{
	/* Interrupts are expected to be masked on entry or re-entry via
	 * Idle_load_context()
	 */
	assert((__builtin_arm_rsr("DAIF") & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE);
	/* Check for, and act on, a WFE recommendation.
	 * Bypasses context spill/fill for a minor perf. increment.
	 * May unmask and restore IRQ+FIQ mask.
	 */
	if (wfe_process_recommendation() == false) {
		/* If WFE recommendation absent, or WFE deadline
		 * arrived with no interrupt pending/processed,
		 * fall back to WFI.
		 */
		Idle_context();
	}
	/* Unmask IRQ+FIQ before returning to the idle loop. */
	__builtin_arm_wsr("DAIFClr", DAIFSC_STANDARD_DISABLE);
}
386 
/* Issue a full-system data synchronization barrier (DSB SY). */
void
OSSynchronizeIO(void)
{
	__builtin_arm_dsb(DSB_SY);
}
392 
/* Read ACTLR_EL1 (Auxiliary Control Register). */
uint64_t
get_aux_control(void)
{
	uint64_t        value;

	MRS(value, "ACTLR_EL1");
	return value;
}
401 
/* Read SCTLR_EL1 (System Control Register, which controls the MMU). */
uint64_t
get_mmu_control(void)
{
	uint64_t        value;

	MRS(value, "SCTLR_EL1");
	return value;
}
410 
/* Read TCR_EL1 (Translation Control Register). */
uint64_t
get_tcr(void)
{
	uint64_t        value;

	MRS(value, "TCR_EL1");
	return value;
}
419 
420 boolean_t
ml_get_interrupts_enabled(void)421 ml_get_interrupts_enabled(void)
422 {
423 	uint64_t        value;
424 
425 	MRS(value, "DAIF");
426 	if ((value & DAIF_STANDARD_DISABLE) == DAIF_STANDARD_DISABLE) {
427 		return FALSE;
428 	}
429 	return TRUE;
430 }
431 
/* Read TTBR0_EL1 (Translation Table Base Register 0). */
pmap_paddr_t
get_mmu_ttb(void)
{
	pmap_paddr_t    value;

	MRS(value, "TTBR0_EL1");
	return value;
}
440 
441 uint32_t
get_arm_cpu_version(void)442 get_arm_cpu_version(void)
443 {
444 	uint32_t value = machine_read_midr();
445 
446 	/* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
447 	return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
448 }
449 
/*
 * Test an implementation-defined feature bit in AIDR_EL1.
 * Returns true if the requested bit(s) are set.
 */
bool
ml_feature_supported(uint64_t feature_bit)
{
	uint64_t aidr_el1_value = 0;

	MRS(aidr_el1_value, "AIDR_EL1");

#ifdef APPLEAVALANCHE
#endif // APPLEAVALANCHE

	return aidr_el1_value & feature_bit;
}
462 
/*
 * user_cont_hwclock_allowed()
 *
 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
 * as a continuous time source (e.g. from mach_continuous_time)
 */
boolean_t
user_cont_hwclock_allowed(void)
{
#if HAS_CONTINUOUS_HWCLOCK
	return TRUE;
#else
	return FALSE;
#endif
}
478 
/*
 * user_timebase_type()
 *
 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
 *
 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
 *
 */

uint8_t
user_timebase_type(void)
{
#if HAS_ACNTVCT
	return USER_TIMEBASE_NOSPEC_APPLE;
#elif HAS_APPLE_GENERIC_TIMER
	// Conveniently, S3_4_C15_C10_6 and ACNTVCT_EL0 have identical encodings
	return USER_TIMEBASE_NOSPEC_APPLE;
#elif __ARM_ARCH_8_6__
	return USER_TIMEBASE_NOSPEC;
#else
	return USER_TIMEBASE_SPEC;
#endif
}
504 
/*
 * Machine-dependent startup: consume machine boot-args, configure the
 * machine layer, then enter the (non-returning) kernel bootstrap.
 */
void
machine_startup(__unused boot_args * args)
{
#if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
	/* Allow the "fastipi" boot-arg to override the fast-IPI default. */
	if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
		gFastIPI = 1;
	}
#endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/


	machine_conf();


	/*
	 * Kick off the kernel bootstrap.
	 */
	kernel_bootstrap();
	/* NOTREACHED */
}
524 
typedef void (*invalidate_fn_t)(void);

/* Kext-provided callback used by machine_lockdown() to invalidate the SIO
 * HMAC key; write-once (see set_invalidate_hmac_function). */
static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;

void set_invalidate_hmac_function(invalidate_fn_t fn);

/*
 * Register the HMAC-invalidation callback.  May be called at most once;
 * panics on a second registration attempt.
 */
void
set_invalidate_hmac_function(invalidate_fn_t fn)
{
	if (NULL != invalidate_hmac_function) {
		panic("Invalidate HMAC function already set");
	}

	invalidate_hmac_function = fn;
}
540 
/* Secure (SEP-backed) hibernation is not supported on this configuration. */
bool
ml_is_secure_hib_supported(void)
{
	return false;
}
546 
/*
 * Finalize early-boot security lockdown: make kernel RO memory actually
 * read-only (via SPTM, KTRR/CTRR, or Watchtower depending on config),
 * lock down the PPL where present, invalidate the SIO HMAC key on
 * PPL-based systems, and finally publish lockdown_done.
 */
void
machine_lockdown(void)
{

#if CONFIG_SPTM

	/**
	 * On devices that make use of the SPTM, the SPTM is responsible for
	 * managing system register locks. Due to this, we skip the call to
	 * spr_lockdown() below.
	 */
#else
#endif

	arm_vm_prot_finalize(PE_state.bootArgs);

#if CONFIG_KERNEL_INTEGRITY
#if KERNEL_INTEGRITY_WT
	/* Watchtower
	 *
	 * Notify the monitor about the completion of early kernel bootstrap.
	 * From this point forward it will enforce the integrity of kernel text,
	 * rodata and page tables.
	 */

#ifdef MONITOR
	monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
#endif
#endif /* KERNEL_INTEGRITY_WT */

#if CONFIG_SPTM
	extern void pmap_prepare_commpages(void);
	pmap_prepare_commpages();

	/**
	 * sptm_lockdown_xnu() disables preemption like all SPTM calls, but may take
	 * a fair amount of time as it involves retyping a large number of pages.
	 * This preemption latency is not really a concern since we're still fairly
	 * early in the boot process, so just explicitly disable preemption before
	 * invoking the SPTM and abandon preemption latency measurements before
	 * re-enabling it.
	 */
	disable_preemption();
	/* Signal the SPTM that XNU is ready for RO memory to actually become read-only */
	sptm_lockdown_xnu();
#if SCHED_HYGIENE_DEBUG
	abandon_preemption_disable_measurement();
#endif /* SCHED_HYGIENE_DEBUG */
	enable_preemption();
#else
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	/* KTRR
	 *
	 * Lock physical KTRR region. KTRR region is read-only. Memory outside
	 * the region is not executable at EL1.
	 */

	rorgn_lockdown();
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
#endif /* CONFIG_SPTM */

#if XNU_MONITOR
	pmap_lockdown_ppl();
#endif

#endif /* CONFIG_KERNEL_INTEGRITY */


	/**
	 * For platforms that use SEP-backed hibernation, invoke kext-provided
	 * functionality to invalidate HMAC key in SIO used to sign a variety of
	 * data (e.g., the RO region).
	 *
	 * Just for paranoia's sake, let's make it so that if an attacker is
	 * capable of corrupting EDT early that they have to do so in a way that
	 * prevents invaldidate_hmac_function from running properly yet still
	 * makes it so that the invalidate HMAC function receives an OK
	 * response, which seems hard.
	 *
	 * This only makes sense for PPL-based systems seeing as SPTM-based systems
	 * will have iBoot invalidate Key1 for us.
	 */
	if (NULL != invalidate_hmac_function) {
#if !defined(CONFIG_SPTM)
		invalidate_hmac_function();
#endif /* !defined(CONFIG_SPTM) */
	}

	lockdown_done = 1;
}
637 
638 
/* Return the platform boot-args string; buf/size are unused on this port. */
char           *
machine_boot_info(
	__unused char *buf,
	__unused vm_size_t size)
{
	return PE_boot_args();
}
646 
/* Re-initialize per-CPU machine state after a processor is (re)started. */
void
machine_cpu_reinit(__unused void *param)
{
	cpu_machine_init();     /* Initialize the processor */
	clock_init();           /* Init the clock */
}
653 
/*
 *	Routine:        machine_processor_shutdown
 *	Function:	Switch to the shutdown context, invoking doshutdown on
 *			the given processor; returns the previously-running thread.
 */
thread_t
machine_processor_shutdown(
	__unused thread_t thread,
	void (*doshutdown)(processor_t),
	processor_t processor)
{
	return Shutdown_context(doshutdown, processor);
}
666 
/*
 *      Routine:        ml_init_lock_timeout
 *      Function:	Compute lock/mutex spin timeouts from boot-args,
 *			falling back to the MACHINE_TIMEOUT defaults.
 */
static void __startup_func
ml_init_lock_timeout(void)
{
	/*
	 * This function is called after STARTUP_SUB_TIMEOUTS
	 * initialization, so using the "legacy" boot-args here overrides
	 * the ml-timeout-...  configuration. (Given that these boot-args
	 * here are usually explicitly specified, this makes sense by
	 * overriding ml-timeout-..., which may come from the device tree.
	 */

	uint64_t lto_timeout_ns;
	uint64_t lto_abstime;
	uint32_t slto;

	/* "slto_us": spinlock timeout, in microseconds. */
	if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
		lto_timeout_ns = slto * NSEC_PER_USEC;
		nanoseconds_to_absolutetime(lto_timeout_ns, &lto_abstime);
		os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
	} else {
		lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
		absolutetime_to_nanoseconds(lto_abstime, &lto_timeout_ns);
	}

	os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);

	/* "tlto_us": ticket-lock timeout; defaults to half the lock timeout. */
	if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
		nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, &lto_abstime);
		os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
	} else if (lto_abstime != 0) {
		os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
	} // else take default from MACHINE_TIMEOUT.

	uint64_t mtxspin;
	uint64_t mtx_abstime;
	/* "mtxspin": adaptive-mutex spin window in usec, capped at 1/16 second. */
	if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
		if (mtxspin > USEC_PER_SEC >> 4) {
			mtxspin =  USEC_PER_SEC >> 4;
		}
		nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
		os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
	} else {
		mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
	}

	low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
	/*
	 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
	 * real_ncpus is not set at this time
	 *
	 * NOTE: active spinning is disabled in arm. It can be activated
	 * by setting high_MutexSpin through the sysctl.
	 */
	high_MutexSpin = low_MutexSpin;

	/* "max_wfe_us": cap on perf-controller WFE hint intervals. */
	uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
	PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
	nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
}
STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
731 
732 
733 /*
734  * This is called when all of the ml_processor_info_t structures have been
735  * initialized and all the processors have been started through processor_boot().
736  *
737  * Required by the scheduler subsystem.
738  */
739 void
ml_cpu_init_completed(void)740 ml_cpu_init_completed(void)
741 {
742 	sched_cpu_init_completed();
743 }
744 
/*
 * Bitmask of CPUs that are between ml_cpu_up() and ml_cpu_down(), i.e.
 * currently up (bit index == cpu_number; set here, cleared in ml_cpu_down).
 */
_Atomic uint64_t ml_cpu_up_processors = 0;

/* Mark the calling CPU as up by setting its bit in ml_cpu_up_processors. */
void
ml_cpu_up(void)
{
	cpu_data_t *cpu_data_ptr = getCpuDatap();

	/* A CPU must not be marked up twice without an intervening ml_cpu_down(). */
	assert(!bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));

	atomic_bit_set(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_relaxed);
}
759 
/*
 * These are called from the machine-independent routine cpu_up()
 * to perform machine-dependent info updates.
 *
 * The update to CPU counts needs to be separate from other actions
 * because we don't update the counts when CLPC causes temporary
 * cluster powerdown events, as these must be transparent to the user.
 */

void
ml_cpu_up_update_counts(int cpu_id)
{
	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];

	os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);

	os_atomic_inc(&machine_info.physical_cpu, relaxed);
	os_atomic_inc(&machine_info.logical_cpu, relaxed);
}
779 
780 int
ml_find_next_up_processor()781 ml_find_next_up_processor()
782 {
783 	if (BootCpuData.cpu_running) {
784 		return BootCpuData.cpu_number;
785 	}
786 
787 	int next_active_cpu = lsb_first(os_atomic_load(&ml_cpu_up_processors, relaxed));
788 
789 	if (next_active_cpu == -1) {
790 		assertf(ml_is_quiescing(), "can only have no active CPUs in quiesce state");
791 		next_active_cpu = BootCpuData.cpu_number;
792 	}
793 
794 	return next_active_cpu;
795 }
796 
/*
 * These are called from the machine-independent routine cpu_down()
 * to perform machine-dependent info updates.
 *
 * The update to CPU counts needs to be separate from other actions
 * because we don't update the counts when CLPC causes temporary
 * cluster powerdown events, as these must be transparent to the user.
 */
void
ml_cpu_down(void)
{
	/*
	 * If we want to deal with outstanding IPIs, we need to
	 * do relatively early in the processor_doshutdown path,
	 * as we pend decrementer interrupts using the IPI
	 * mechanism if we cannot immediately service them (if
	 * IRQ is masked).  Do so now.
	 *
	 * We aren't on the interrupt stack here; would it make
	 * more sense to disable signaling and then enable
	 * interrupts?  It might be a bit cleaner.
	 */
	cpu_data_t *cpu_data_ptr = getCpuDatap();
	cpu_data_ptr->cpu_running = FALSE;

	assert((cpu_data_ptr->cpu_signal & SIGPdisabled) == 0);
	assert(bit_test(os_atomic_load(&ml_cpu_up_processors, relaxed), cpu_data_ptr->cpu_number));

	/* Clear our bit before migrating timers so we never pick ourselves. */
	atomic_bit_clear(&ml_cpu_up_processors, cpu_data_ptr->cpu_number, memory_order_release);

	if (cpu_data_ptr == &BootCpuData && ml_is_quiescing()) {
		/*
		 * This is the boot CPU powering down for S2R, don't try to migrate its timers,
		 * because there is nobody else active to migrate it to.
		 */
		assert3u(os_atomic_load(&ml_cpu_up_processors, relaxed), ==, 0);
	} else if (cpu_data_ptr != &BootCpuData || (support_bootcpu_shutdown && !ml_is_quiescing())) {
		int next_cpu = ml_find_next_up_processor();

		cpu_data_t* new_cpu_datap = cpu_datap(next_cpu);

		/*
		 * Move all of this cpu's timers to another cpu that has not gone through ml_cpu_down,
		 * and poke it in case there's a sooner deadline for it to schedule.
		 *
		 * This depends on ml_cpu_down never running concurrently, which is guaranteed by
		 * the processor_updown_lock.
		 */
		timer_queue_shutdown(next_cpu, &cpu_data_ptr->rtclock_timer.queue,
		    &new_cpu_datap->rtclock_timer.queue);

		/*
		 * Trigger timer_queue_expire_local to execute on the remote CPU.
		 *
		 * Because we have interrupts disabled here, we cannot use a
		 * standard cpu_xcall, which would deadlock against the stackshot
		 * IPI. This must be a fire-and-forget IPI.
		 */
		kern_return_t rv = cpu_signal(new_cpu_datap, SIGPTimerLocal, NULL, NULL);

		if (rv != KERN_SUCCESS) {
			panic("ml_cpu_down: cpu_signal of cpu %d failure %d", next_cpu, rv);
		}
	} else {
		panic("boot cpu powering down with nowhere for its timers to go");
	}

	/* Drain any IPIs still pending against this core. */
	cpu_signal_handler_internal(TRUE);

	/* There should be no more pending IPIs on this core. */
	assert3u(getCpuDatap()->cpu_signal, ==, SIGPdisabled);
}
869 
/* Counterpart of ml_cpu_up_update_counts(): decrement the per-cluster-type
 * and machine-wide active CPU counts for the CPU going down. */
void
ml_cpu_down_update_counts(int cpu_id)
{
	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];

	os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);

	os_atomic_dec(&machine_info.physical_cpu, relaxed);
	os_atomic_dec(&machine_info.logical_cpu, relaxed);
}
880 
881 
/* Return the machine's memory size as recorded in machine_info. */
unsigned int
ml_get_machine_mem(void)
{
	return machine_info.memory_size;
}
887 
/* Halt (or restart, if requested) the platform via PEHaltRestart; never returns. */
__attribute__((noreturn))
void
halt_all_cpus(boolean_t reboot)
{
	if (reboot) {
		printf("MACH Reboot\n");
		PEHaltRestart(kPERestartCPU);
	} else {
		printf("CPU halted\n");
		PEHaltRestart(kPEHaltCPU);
	}
	/* PEHaltRestart is not expected to return; spin forever if it does. */
	while (1) {
		;
	}
}
903 
/* Halt the system without rebooting; never returns. */
__attribute__((noreturn))
void
halt_cpu(void)
{
	halt_all_cpus(FALSE);
}
910 
/*
 *	Routine:        machine_signal_idle
 *	Function:	Send a no-op signal to wake the given processor out of idle.
 */
void
machine_signal_idle(
	processor_t processor)
{
	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
}
922 
/* Send a deferred wake signal to the given processor. */
void
machine_signal_idle_deferred(
	processor_t processor)
{
	cpu_signal_deferred(processor_to_cpu_datap(processor));
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
}
930 
/* Cancel a previously sent deferred wake signal for the given processor. */
void
machine_signal_idle_cancel(
	processor_t processor)
{
	cpu_signal_cancel(processor_to_cpu_datap(processor));
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
}
938 
/*
 *	Routine:        ml_install_interrupt_handler
 *	Function:	Initialize Interrupt Handler
 *
 *	Records the interrupt nub/source/target/handler/refCon in the calling
 *	CPU's cpu_data, with interrupts masked around the update.
 */
void
ml_install_interrupt_handler(
	void *nub,
	int source,
	void *target,
	IOInterruptHandler handler,
	void *refCon)
{
	cpu_data_t     *cpu_data_ptr;
	boolean_t       current_state;

	/* Mask interrupts so the handler fields are never observed half-updated. */
	current_state = ml_set_interrupts_enabled(FALSE);
	cpu_data_ptr = getCpuDatap();

	cpu_data_ptr->interrupt_nub = nub;
	cpu_data_ptr->interrupt_source = source;
	cpu_data_ptr->interrupt_target = target;
	cpu_data_ptr->interrupt_handler = handler;
	cpu_data_ptr->interrupt_refCon = refCon;

	(void) ml_set_interrupts_enabled(current_state);
}
965 
966 /*
967  *	Routine:        ml_init_interrupt
968  *	Function:	Initialize Interrupts
969  */
970 void
ml_init_interrupt(void)971 ml_init_interrupt(void)
972 {
973 #if defined(HAS_IPI)
974 	/*
975 	 * ml_init_interrupt will get called once for each CPU, but this is redundant
976 	 * because there is only one global copy of the register for skye. do it only
977 	 * on the bootstrap cpu
978 	 */
979 	if (getCpuDatap()->cluster_master) {
980 		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
981 	}
982 #endif
983 }
984 
985 /*
986  *	Routine:        ml_init_timebase
987  *	Function:	register and setup Timebase, Decremeter services
988  */
989 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)990 ml_init_timebase(
991 	void            *args,
992 	tbd_ops_t       tbd_funcs,
993 	vm_offset_t     int_address,
994 	vm_offset_t     int_value __unused)
995 {
996 	cpu_data_t     *cpu_data_ptr;
997 
998 	cpu_data_ptr = (cpu_data_t *)args;
999 
1000 	if ((cpu_data_ptr == &BootCpuData)
1001 	    && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
1002 		rtclock_timebase_func = *tbd_funcs;
1003 		rtclock_timebase_addr = int_address;
1004 	}
1005 }
1006 
1007 #define ML_READPROP_MANDATORY UINT64_MAX
1008 
1009 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)1010 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
1011 {
1012 	void const *prop;
1013 	unsigned int propSize;
1014 
1015 	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
1016 		if (propSize == sizeof(uint8_t)) {
1017 			return *((uint8_t const *)prop);
1018 		} else if (propSize == sizeof(uint16_t)) {
1019 			return *((uint16_t const *)prop);
1020 		} else if (propSize == sizeof(uint32_t)) {
1021 			return *((uint32_t const *)prop);
1022 		} else if (propSize == sizeof(uint64_t)) {
1023 			return *((uint64_t const *)prop);
1024 		} else {
1025 			panic("CPU property '%s' has bad size %u", propertyName, propSize);
1026 		}
1027 	} else {
1028 		if (default_value == ML_READPROP_MANDATORY) {
1029 			panic("Missing mandatory property '%s'", propertyName);
1030 		}
1031 		return default_value;
1032 	}
1033 }
1034 
1035 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)1036 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
1037 {
1038 	uint64_t const *prop;
1039 	unsigned int propSize;
1040 
1041 	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
1042 		return FALSE;
1043 	}
1044 
1045 	if (propSize != sizeof(uint64_t) * 2) {
1046 		panic("Wrong property size for %s", propertyName);
1047 	}
1048 
1049 	*pa_ptr = prop[0];
1050 	*len_ptr = prop[1];
1051 	return TRUE;
1052 }
1053 
1054 static boolean_t
ml_is_boot_cpu(const DTEntry entry)1055 ml_is_boot_cpu(const DTEntry entry)
1056 {
1057 	void const *prop;
1058 	unsigned int propSize;
1059 
1060 	if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
1061 		panic("unable to retrieve state for cpu");
1062 	}
1063 
1064 	if (strncmp((char const *)prop, "running", propSize) == 0) {
1065 		return TRUE;
1066 	} else {
1067 		return FALSE;
1068 	}
1069 }
1070 
1071 static void
ml_cluster_power_override(unsigned int * flag)1072 ml_cluster_power_override(unsigned int *flag)
1073 {
1074 #if XNU_CLUSTER_POWER_DOWN
1075 	/*
1076 	 * Old method (H14/H15): enable CPD in the kernel build
1077 	 * For H16+, *flag may have be set to 1 through EDT
1078 	 */
1079 	*flag = 1;
1080 #endif
1081 
1082 	/*
1083 	 * If a boot-arg is set that allows threads to be bound
1084 	 * to a cpu or cluster, cluster_power_down must
1085 	 * default to false.
1086 	 */
1087 #ifdef CONFIG_XNUPOST
1088 	uint64_t kernel_post = 0;
1089 	PE_parse_boot_argn("kernPOST", &kernel_post, sizeof(kernel_post));
1090 	if (kernel_post != 0) {
1091 		*flag = 0;
1092 	}
1093 #endif
1094 	if (PE_parse_boot_argn("enable_skstb", NULL, 0)) {
1095 		*flag = 0;
1096 	}
1097 	if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
1098 		*flag = 0;
1099 	}
1100 
1101 	/* Always let the user manually override, even if it's unsupported */
1102 	PE_parse_boot_argn("cluster_power", flag, sizeof(*flag));
1103 }
1104 
1105 static void
ml_read_chip_revision(unsigned int * rev __unused)1106 ml_read_chip_revision(unsigned int *rev __unused)
1107 {
1108 	// The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
1109 #ifdef APPLE_ARM64_ARCH_FAMILY
1110 	DTEntry         entryP;
1111 
1112 	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
1113 		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
1114 	} else {
1115 		*rev = CPU_VERSION_UNKNOWN;
1116 	}
1117 #endif
1118 }
1119 
/*
 * Walk the /cpus device-tree nodes and populate topology_info: per-CPU
 * records, a linearized cluster ID space, cache IDs, IMPL register ranges,
 * and the boot CPU/cluster.  Honors the mutually-exclusive cpus= and
 * cpumask= boot-args, and seeds TPIDR_EL0 for the boot CPU.
 */
void
ml_parse_cpu_topology(void)
{
	DTEntry entry, child __unused;
	OpaqueDTEntryIterator iter;
	uint32_t cpu_boot_arg = MAX_CPUS;
	uint64_t cpumask_boot_arg = ULLONG_MAX;
	int err;

	int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
	int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));

	// The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
	// so that we trigger a panic later in the boot process, once serial is enabled.
	if (cpus_boot_arg_present && cpumask_boot_arg_present) {
		cpu_config_correct = false;
	}

	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
	assert(err == kSuccess);

	err = SecureDTInitEntryIterator(entry, &iter);
	assert(err == kSuccess);

	/* -1 marks a physical cluster ID not seen yet. */
	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
		cluster_offsets[i] = -1;
		cluster_phys_to_logical[i] = -1;
		cluster_max_cpu_phys_id[i] = 0;
	}

	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
		/* cpumask bit i enables the i'th DT CPU node; consume one bit per node. */
		boolean_t cpu_enabled = cpumask_boot_arg & 1;
		cpumask_boot_arg >>= 1;

		// Boot CPU disabled in cpumask. Flag this so that we trigger a panic
		// later in the boot process, once serial is enabled.
		if (is_boot_cpu && !cpu_enabled) {
			cpu_config_correct = false;
		}

		// Ignore this CPU if it has been disabled by the cpumask= boot-arg.
		if (!is_boot_cpu && !cpu_enabled) {
			continue;
		}

		// If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
		// been added to the topology struct yet, and we only have one slot left, then skip
		// every other non-boot CPU in order to leave room for the boot CPU.
		//
		// e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
		// array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
		if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
			continue;
		}
		if (topology_info.num_cpus >= cpu_boot_arg) {
			break;
		}

		ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];

		/* Logical CPU IDs are assigned densely in DT iteration order. */
		cpu->cpu_id = topology_info.num_cpus++;
		assert(cpu->cpu_id < MAX_CPUS);
		topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);

		cpu->die_id = (int)ml_readprop(child, "die-id", 0);
		topology_info.max_die_id = MAX(topology_info.max_die_id, cpu->die_id);

		/* "reg" holds the MPIDR-style physical ID and must be present. */
		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);

		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);

		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
		cpu->cluster_type = CLUSTER_TYPE_SMP;

		/* "cluster-type" is a character code: 'E' = efficiency, 'P' = performance. */
		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
		if (cluster_type == 'E') {
			cpu->cluster_type = CLUSTER_TYPE_E;
		} else if (cluster_type == 'P') {
			cpu->cluster_type = CLUSTER_TYPE_P;
		}

		if (ml_readprop(child, "cluster-power-down", 0)) {
			topology_info.cluster_power_down = 1;
		}

		topology_info.cluster_type_num_cpus[cpu->cluster_type]++;

		/*
		 * Since we want to keep a linear cluster ID space, we cannot just rely
		 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
		 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
		 */
#if HAS_CLUSTER
		uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
#else
		uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
#endif
		assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
		cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
		    topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);

		assert(cpu->cluster_id < MAX_CPU_CLUSTERS);

		ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
		/* First CPU seen in this cluster: initialize the cluster record. */
		if (cluster->num_cpus == 0) {
			assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);

			topology_info.num_clusters++;
			topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
			topology_info.cluster_types |= (1 << cpu->cluster_type);

			cluster->cluster_id = cpu->cluster_id;
			cluster->die_id = cpu->die_id;
			cluster->cluster_type = cpu->cluster_type;
			cluster->first_cpu_id = cpu->cpu_id;
			assert(cluster_phys_to_logical[phys_cluster_id] == -1);
			cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;

			topology_info.cluster_type_num_clusters[cluster->cluster_type]++;

			// Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
			// If we wind up with a bunch of these, we might want to create separate per-cluster
			// EDT nodes and have the CPU nodes reference them through a phandle.
			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
		}

#if HAS_CLUSTER
		if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
			cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
		}
#endif

		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
		cluster->die_cluster_id = cpu->die_cluster_id;

		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));

		cluster->num_cpus++;
		cluster->cpu_mask |= 1ULL << cpu->cpu_id;

		if (is_boot_cpu) {
			assert(topology_info.boot_cpu == NULL);
			topology_info.boot_cpu = cpu;
			topology_info.boot_cluster = cluster;
		}

#if CONFIG_SPTM
		sptm_register_cpu(cpu->phys_id);
#endif
	}

#if HAS_CLUSTER
	/*
	 * Build the cluster offset array, ensuring that the region reserved
	 * for each physical cluster contains enough entries to be indexed
	 * by the maximum physical CPU ID (AFF0) within the cluster.
	 */
	unsigned int cur_cluster_offset = 0;
	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
		if (cluster_phys_to_logical[i] != -1) {
			cluster_offsets[i] = cur_cluster_offset;
			cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
		}
	}
	assert(cur_cluster_offset <= MAX_CPUS);
#else
	/*
	 * For H10, there are really 2 physical clusters, but they are not separated
	 * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
	 * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
	 * treat H10 and earlier devices as though they contain a single cluster.
	 */
	cluster_offsets[0] = 0;
#endif
	assert(topology_info.boot_cpu != NULL);
	ml_read_chip_revision(&topology_info.chip_revision);
	ml_cluster_power_override(&topology_info.cluster_power_down);

	/*
	 * Set TPIDR_EL0 to indicate the correct cpu number & cluster id,
	 * as we may not be booting from cpu 0. Userspace will consume
	 * the current CPU number through this register. For non-boot
	 * cores, this is done in start.s (start_cpu) using the per-cpu
	 * data object.
	 */
	ml_topology_cpu_t *boot_cpu = topology_info.boot_cpu;
	uint64_t tpidr_el0 = ((boot_cpu->cpu_id << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
	    ((boot_cpu->cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
	assert(((tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == boot_cpu->cpu_id);
	assert(((tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == boot_cpu->cluster_id);
	__builtin_arm_wsr64("TPIDR_EL0", tpidr_el0);

	__builtin_arm_wsr64("TPIDRRO_EL0", 0);
}
1323 
1324 const ml_topology_info_t *
ml_get_topology_info(void)1325 ml_get_topology_info(void)
1326 {
1327 	return &topology_info;
1328 }
1329 
/*
 * Map the implementation-defined per-CPU and per-cluster register ranges
 * (physical addresses discovered during topology parsing) into kernel VA.
 */
void
ml_map_cpu_pio(void)
{
	unsigned int i;

	for (i = 0; i < topology_info.num_cpus; i++) {
		ml_topology_cpu_t *cpu = &topology_info.cpus[i];
		if (cpu->cpu_IMPL_pa) {
			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
			/*
			 * NOTE(review): coresight is mapped whenever cpu_IMPL_pa is
			 * set, without checking coresight_pa — presumably the two
			 * properties are always populated together; confirm in EDT.
			 */
			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
		}
		if (cpu->cpu_UTTDBG_pa) {
			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
		}
	}

	for (i = 0; i < topology_info.num_clusters; i++) {
		ml_topology_cluster_t *cluster = &topology_info.clusters[i];
		if (cluster->acc_IMPL_pa) {
			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
		}
		if (cluster->cpm_IMPL_pa) {
			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
		}
	}
}
1356 
1357 unsigned int
ml_get_cpu_count(void)1358 ml_get_cpu_count(void)
1359 {
1360 	return topology_info.num_cpus;
1361 }
1362 
1363 unsigned int
ml_get_cluster_count(void)1364 ml_get_cluster_count(void)
1365 {
1366 	return topology_info.num_clusters;
1367 }
1368 
1369 int
ml_get_boot_cpu_number(void)1370 ml_get_boot_cpu_number(void)
1371 {
1372 	return topology_info.boot_cpu->cpu_id;
1373 }
1374 
1375 cluster_type_t
ml_get_boot_cluster_type(void)1376 ml_get_boot_cluster_type(void)
1377 {
1378 	return topology_info.boot_cluster->cluster_type;
1379 }
1380 
1381 int
ml_get_cpu_number(uint32_t phys_id)1382 ml_get_cpu_number(uint32_t phys_id)
1383 {
1384 	phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1385 
1386 	for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1387 		if (topology_info.cpus[i].phys_id == phys_id) {
1388 			return i;
1389 		}
1390 	}
1391 
1392 	return -1;
1393 }
1394 
1395 int
ml_get_cluster_number(uint32_t phys_id)1396 ml_get_cluster_number(uint32_t phys_id)
1397 {
1398 	int cpu_id = ml_get_cpu_number(phys_id);
1399 	if (cpu_id < 0) {
1400 		return -1;
1401 	}
1402 
1403 	ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1404 
1405 	return cpu->cluster_id;
1406 }
1407 
/*
 * Return the logical CPU number of the calling CPU, derived from MPIDR_EL1.
 */
unsigned int
ml_get_cpu_number_local(void)
{
	uint64_t mpidr_el1_value = 0;
	unsigned cpu_id;

	/* We identify the CPU based on the constant bits of MPIDR_EL1. */
	MRS(mpidr_el1_value, "MPIDR_EL1");
	/*
	 * NOTE(review): ml_get_cpu_number() returns -1 for an unknown MPIDR,
	 * which wraps to UINT_MAX in this unsigned variable; the assert below
	 * catches that on development builds.
	 */
	cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);

	assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());

	return cpu_id;
}
1422 
/*
 * Return the logical cluster ID of the calling CPU, derived from MPIDR_EL1.
 */
int
ml_get_cluster_number_local()
{
	uint64_t mpidr_el1_value = 0;
	unsigned cluster_id;

	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
	MRS(mpidr_el1_value, "MPIDR_EL1");
	/*
	 * NOTE(review): a -1 ("unknown") result wraps to UINT_MAX in this
	 * unsigned variable; the assert below catches that on development builds.
	 */
	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);

	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());

	return cluster_id;
}
1437 
1438 int
ml_get_max_cpu_number(void)1439 ml_get_max_cpu_number(void)
1440 {
1441 	return topology_info.max_cpu_id;
1442 }
1443 
1444 int
ml_get_max_cluster_number(void)1445 ml_get_max_cluster_number(void)
1446 {
1447 	return topology_info.max_cluster_id;
1448 }
1449 
1450 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1451 ml_get_first_cpu_id(unsigned int cluster_id)
1452 {
1453 	return topology_info.clusters[cluster_id].first_cpu_id;
1454 }
1455 
1456 static_assert(MAX_CPUS <= 256, "MAX_CPUS must fit in _COMM_PAGE_CPU_TO_CLUSTER; Increase table size if needed");
1457 
1458 void
ml_map_cpus_to_clusters(uint8_t * table)1459 ml_map_cpus_to_clusters(uint8_t *table)
1460 {
1461 	for (uint16_t cpu_id = 0; cpu_id < topology_info.num_cpus; cpu_id++) {
1462 		*(table + cpu_id) = (uint8_t)(topology_info.cpus[cpu_id].cluster_id);
1463 	}
1464 }
1465 
1466 /*
1467  * Return the die id of a cluster.
1468  */
1469 unsigned int
ml_get_die_id(unsigned int cluster_id)1470 ml_get_die_id(unsigned int cluster_id)
1471 {
1472 	/*
1473 	 * The current implementation gets the die_id from the
1474 	 * first CPU of the cluster.
1475 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1476 	 */
1477 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1478 	return topology_info.cpus[first_cpu].die_id;
1479 }
1480 
1481 /*
1482  * Return the index of a cluster in its die.
1483  */
1484 unsigned int
ml_get_die_cluster_id(unsigned int cluster_id)1485 ml_get_die_cluster_id(unsigned int cluster_id)
1486 {
1487 	/*
1488 	 * The current implementation gets the die_id from the
1489 	 * first CPU of the cluster.
1490 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1491 	 */
1492 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1493 	return topology_info.cpus[first_cpu].die_cluster_id;
1494 }
1495 
1496 /*
1497  * Return the highest die id of the system.
1498  */
1499 unsigned int
ml_get_max_die_id(void)1500 ml_get_max_die_id(void)
1501 {
1502 	return topology_info.max_die_id;
1503 }
1504 
/*
 * Early lockdown setup: stash the KTRR/CTRR read-only region bounds on
 * builds with a hardware kernel-integrity scheme; a no-op otherwise.
 */
void
ml_lockdown_init()
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif
}
1512 
/*
 * Register a lockdown-completion handler.  Lockdown has already completed
 * by the time registration is possible, so the handler is simply invoked
 * immediately rather than being stored.
 */
kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f, void *this)
{
	if (!f) {
		return KERN_FAILURE;
	}

	assert(lockdown_done);
	f(this); // lockdown is already done, so call the handler right away

	return KERN_SUCCESS;
}
1525 
/* Platform-registered callback used to flush the machine cache ("M$"). */
static mcache_flush_function mcache_flush_func;
/* Opaque context handed back to mcache_flush_func on each flush. */
static void* mcache_flush_service;
/*
 * Register the machine-cache flush callback and its service context.
 * The most recent registration wins.
 */
kern_return_t
ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
{
	mcache_flush_service = service;
	mcache_flush_func = func;

	return KERN_SUCCESS;
}
1536 
1537 kern_return_t
ml_mcache_flush(void)1538 ml_mcache_flush(void)
1539 {
1540 	if (!mcache_flush_func) {
1541 		panic("Cannot flush M$ with no flush callback registered");
1542 
1543 		return KERN_FAILURE;
1544 	} else {
1545 		return mcache_flush_func(mcache_flush_service);
1546 	}
1547 }
1548 
1549 
kern_return_t ml_mem_fault_report_enable_register(void);
/* Stub: memory-fault report enabling is not implemented here; always succeeds. */
kern_return_t
ml_mem_fault_report_enable_register(void)
{
	return KERN_SUCCESS;
}
1556 
kern_return_t ml_amcc_error_inject_register(void);
/* Stub: AMCC error injection is not implemented here; always succeeds. */
kern_return_t
ml_amcc_error_inject_register(void)
{
	return KERN_SUCCESS;
}
1563 
kern_return_t ml_dcs_error_inject_register(void);
/* Stub: DCS error injection is not implemented here; always succeeds. */
kern_return_t
ml_dcs_error_inject_register(void)
{
	return KERN_SUCCESS;
}
1570 
1571 
1572 extern lck_mtx_t pset_create_lock;
1573 
/*
 * Register one CPU with the scheduler layer: allocate/initialize its per-cpu
 * data, publish its cache/cluster attributes, attach it to (or create) its
 * processor set, and hand back the processor object plus IPI/PMI handlers.
 * Returns KERN_FAILURE for an out-of-range logical ID or when more CPUs
 * register than the topology admits.
 */
kern_return_t
ml_processor_register(ml_processor_info_t *in_processor_info,
    processor_t *processor_out, ipi_handler_t *ipi_handler_out,
    perfmon_interrupt_handler_func *pmi_handler_out)
{
	cpu_data_t *this_cpu_datap;
	processor_set_t pset;
	boolean_t  is_boot_cpu;
	static unsigned int reg_cpu_count = 0;

	if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
		return KERN_FAILURE;
	}

	/* Cap the number of registrations at the parsed topology's CPU count. */
	if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
		return KERN_FAILURE;
	}

	if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
		is_boot_cpu = FALSE;
		this_cpu_datap = cpu_data_alloc(FALSE);
		cpu_data_init(this_cpu_datap);
	} else {
		this_cpu_datap = &BootCpuData;
		is_boot_cpu = TRUE;
		/*
		 * Note that ml_processor_register happens for the boot cpu
		 * *after* it starts running arbitrary threads, possibly
		 * including *userspace*, depending on how long the CPU
		 * services take to match.
		 */
	}

	assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());

	this_cpu_datap->cpu_id = in_processor_info->cpu_id;

	if (!is_boot_cpu) {
		this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
		cpu_data_register(this_cpu_datap);
		assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
	}

	/* Copy platform-supplied callbacks and attributes into per-cpu data. */
	this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
	this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);

	this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
	this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;

	this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
	this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
	this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;

	this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
	this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
	this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
	this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
	this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
	this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;

	/*
	 * Encode cpu_id, cluster_id to be stored in TPIDR_EL0 (see
	 * cswitch.s:set_thread_registers, start.s:start_cpu) for consumption
	 * by userspace.
	 */
	this_cpu_datap->cpu_tpidr_el0 = ((this_cpu_datap->cpu_number << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
	    ((this_cpu_datap->cpu_cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
	assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == this_cpu_datap->cpu_number);
	assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == this_cpu_datap->cpu_cluster_id);

#if HAS_CLUSTER
	/* First CPU to register in a cluster becomes that cluster's master. */
	this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
#else /* HAS_CLUSTER */
	this_cpu_datap->cluster_master = is_boot_cpu;
#endif /* HAS_CLUSTER */
	lck_mtx_lock(&pset_create_lock);
	pset = pset_find(in_processor_info->cluster_id, NULL);
	kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
	if (pset == NULL) {
		/* No pset exists for this cluster yet; create one under the lock. */
		pset_cluster_type_t pset_cluster_type = cluster_type_to_pset_cluster_type(this_cpu_datap->cpu_cluster_type);
		pset_node_t pset_node = cluster_type_to_pset_node(this_cpu_datap->cpu_cluster_type);
		pset = pset_create(pset_node, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
		assert(pset != PROCESSOR_SET_NULL);
#if __AMP__
		kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
#endif /* __AMP__ */
	}
	kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
	lck_mtx_unlock(&pset_create_lock);

	processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
	if (!is_boot_cpu) {
		processor_init(processor, this_cpu_datap->cpu_number, pset);
	}

	*processor_out = processor;
	*ipi_handler_out = cpu_signal_handler;
#if CPMU_AIC_PMI && CONFIG_CPU_COUNTERS
	*pmi_handler_out = mt_cpmu_aic_pmi;
#else
	*pmi_handler_out = NULL;
#endif /* CPMU_AIC_PMI && CONFIG_CPU_COUNTERS */
	if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
		*in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
	}

#if CONFIG_CPU_COUNTERS
	kpc_register_cpu(this_cpu_datap);
#endif /* CONFIG_CPU_COUNTERS */

#ifdef APPLEEVEREST
	/**
	 * H15 SoCs have PIO lockdown applied at early boot for secondary CPUs.
	 * Save PIO lock base addresses.
	 */
	const uint32_t log_id = in_processor_info->log_id;
	const unsigned int cluster_id = topology_info.cpus[log_id].cluster_id;
	this_cpu_datap->cpu_reg_paddr = topology_info.cpus[log_id].cpu_IMPL_pa;
	this_cpu_datap->acc_reg_paddr = topology_info.clusters[cluster_id].acc_IMPL_pa;
	this_cpu_datap->cpm_reg_paddr = topology_info.clusters[cluster_id].cpm_IMPL_pa;
#endif


	if (!is_boot_cpu) {
		random_cpu_init(this_cpu_datap->cpu_number);
		// now let next CPU register itself
		OSIncrementAtomic((SInt32*)&real_ncpus);
	}

	os_atomic_or(&this_cpu_datap->cpu_flags, InitState, relaxed);

#if !USE_APPLEARMSMP
	/*
	 * AppleARMCPU's external processor_start call is now a no-op, so
	 * boot the processor directly when it's registered.
	 *
	 * It needs to be booted here for the boot processor to finish the
	 * subsequent registerInterrupt operations and unblock the other cores.
	 */
	processor_boot(processor);
#endif /* !USE_APPLEARMSMP */

	return KERN_SUCCESS;
}
1720 
1721 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1722 ml_init_arm_debug_interface(
1723 	void * in_cpu_datap,
1724 	vm_offset_t virt_address)
1725 {
1726 	((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1727 	do_debugid();
1728 }
1729 
1730 /*
1731  *	Routine:        init_ast_check
1732  *	Function:
1733  */
1734 void
init_ast_check(__unused processor_t processor)1735 init_ast_check(
1736 	__unused processor_t processor)
1737 {
1738 }
1739 
1740 /*
1741  *	Routine:        cause_ast_check
1742  *	Function:
1743  */
1744 void
cause_ast_check(processor_t processor)1745 cause_ast_check(
1746 	processor_t processor)
1747 {
1748 	assert(processor != PROCESSOR_NULL);
1749 
1750 	if (current_processor() != processor) {
1751 		cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1752 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1753 	}
1754 }
1755 
1756 extern uint32_t cpu_idle_count;
1757 
1758 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1759 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1760 {
1761 	*icp = ml_at_interrupt_context();
1762 	*pidlep = (cpu_idle_count == real_ncpus);
1763 }
1764 
1765 /*
1766  *	Routine:        ml_cause_interrupt
1767  *	Function:	Generate a fake interrupt
1768  */
1769 void
ml_cause_interrupt(void)1770 ml_cause_interrupt(void)
1771 {
1772 	return;                 /* BS_XXX */
1773 }
1774 
1775 /* Map memory map IO space */
1776 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1777 ml_io_map(
1778 	vm_offset_t phys_addr,
1779 	vm_size_t size)
1780 {
1781 	return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1782 }
1783 
1784 /* Map memory map IO space (with protections specified) */
1785 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1786 ml_io_map_with_prot(
1787 	vm_offset_t phys_addr,
1788 	vm_size_t size,
1789 	vm_prot_t prot)
1790 {
1791 	return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1792 }
1793 
1794 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1795 ml_io_map_unmappable(
1796 	vm_offset_t             phys_addr,
1797 	vm_size_t               size,
1798 	unsigned int            flags)
1799 {
1800 	return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1801 }
1802 
1803 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1804 ml_io_map_wcomb(
1805 	vm_offset_t phys_addr,
1806 	vm_size_t size)
1807 {
1808 	return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1809 }
1810 
/*
 * Unmap an IO range mapped with ml_io_map*: remove the translations
 * first, then release the kernel VA range.
 */
void
ml_io_unmap(vm_offset_t addr, vm_size_t sz)
{
	pmap_remove(kernel_pmap, addr, addr + sz);
	kmem_free(kernel_map, addr, sz);
}
1817 
1818 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1819 ml_map_high_window(
1820 	vm_offset_t     phys_addr,
1821 	vm_size_t       len)
1822 {
1823 	return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1824 }
1825 
1826 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1827 ml_static_ptovirt(
1828 	vm_offset_t paddr)
1829 {
1830 	return phystokv(paddr);
1831 }
1832 
/*
 * Apply the appropriate KASLR slide to a static (unslid) kernel address.
 * SPTM/TXM text carries its own slide, distinct from vm_kernel_slide.
 * Returns 0 if the result is not a slid kernel address.
 */
vm_offset_t
ml_static_slide(
	vm_offset_t vaddr)
{
	vm_offset_t slid_vaddr = 0;

#if CONFIG_SPTM
	/* SPTM and TXM ranges use their own slides; everything else uses the kernel slide. */
	if ((vaddr >= vm_sptm_offsets.unslid_base) && (vaddr < vm_sptm_offsets.unslid_top)) {
		slid_vaddr = vaddr + vm_sptm_offsets.slide;
	} else if ((vaddr >= vm_txm_offsets.unslid_base) && (vaddr < vm_txm_offsets.unslid_top)) {
		slid_vaddr = vaddr + vm_txm_offsets.slide;
	} else
#endif /* CONFIG_SPTM */
	{
		slid_vaddr = vaddr + vm_kernel_slide;
	}

	if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
		/* This is only intended for use on static kernel addresses. */
		return 0;
	}

	return slid_vaddr;
}
1857 
/*
 * Remove the KASLR slide from a slid kernel address, returning the static
 * (unslid) address. Returns 0 if the input is not a slid kernel address.
 */
vm_offset_t
ml_static_unslide(
	vm_offset_t vaddr)
{
	if (!VM_KERNEL_IS_SLID(vaddr)) {
		/* This is only intended for use on static kernel addresses. */
		return 0;
	}

#if CONFIG_SPTM
	/**
	 * Addresses coming from the SPTM and TXM have a different slide than the
	 * rest of the kernel.
	 */
	if ((vaddr >= vm_sptm_offsets.slid_base) && (vaddr < vm_sptm_offsets.slid_top)) {
		return vaddr - vm_sptm_offsets.slide;
	}

	if ((vaddr >= vm_txm_offsets.slid_base) && (vaddr < vm_txm_offsets.slid_top)) {
		return vaddr - vm_txm_offsets.slide;
	}
#endif /* CONFIG_SPTM */

	return vaddr - vm_kernel_slide;
}
1883 
1884 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1885 
/*
 *	Routine:	ml_static_protect
 *	Function:	Change the protections of a page-aligned, statically mapped
 *			kernel virtual range. W+X requests and post-lockdown
 *			executable requests panic. Returns KERN_FAILURE if the range
 *			includes a block/contiguous-hint mapping whose existing
 *			protections differ from the requested ones.
 */
kern_return_t
ml_static_protect(
	vm_offset_t vaddr, /* kernel virtual address */
	vm_size_t size,
	vm_prot_t new_prot __unused)
{
#if CONFIG_SPTM
	/**
	 * Retype any frames that may be passed to the VM to XNU_DEFAULT.
	 * Under SPTM, this path only needs to guarantee the frame type; the
	 * SPTM itself owns the page-table protection bits.
	 */
	for (vm_offset_t sptm_vaddr_cur = vaddr; sptm_vaddr_cur < trunc_page_64(vaddr + size); sptm_vaddr_cur += PAGE_SIZE) {
		/* Check if this frame is XNU_DEFAULT and only retype it if is not */
		sptm_paddr_t sptm_paddr_cur = kvtophys_nofail(sptm_vaddr_cur);
		sptm_frame_type_t current_type = sptm_get_frame_type(sptm_paddr_cur);
		if (current_type != XNU_DEFAULT) {
			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
			sptm_retype(sptm_paddr_cur, current_type, XNU_DEFAULT, retype_params);
		}
	}

	return KERN_SUCCESS;
#else /* CONFIG_SPTM */
	pt_entry_t    arm_prot = 0;
	pt_entry_t    arm_block_prot = 0;
	vm_offset_t   vaddr_cur;
	ppnum_t       ppn;
	kern_return_t result = KERN_SUCCESS;

	/* Only physical-aperture (statically mapped) addresses are supported. */
	if (vaddr < physmap_base) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) physmap_base);
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	/* Always user-never-execute; privileged-never-execute unless EXECUTE requested. */
	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t      *tte2;
			pt_entry_t      *pte_p;
			pt_entry_t      ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections.  We still want to run checks on a per-page basis.
					 */
					continue;
				}

				result = KERN_FAILURE;
				break;
			}

			/* Index into the L3 table the TTE points at to find this page's PTE. */
			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				{
					unsigned int    i;
					pt_entry_t      *ptep_iter;

					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	/* Invalidate the TLB for whatever portion of the range was visited. */
	if (vaddr_cur > vaddr) {
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}


	return result;
#endif /* CONFIG_SPTM */
}
2027 
2028 #if defined(CONFIG_SPTM)
2029 /*
2030  * Returns true if the given physical address is in one of the boot kernelcache ranges.
2031  */
2032 static bool
ml_physaddr_in_bootkc_range(vm_offset_t physaddr)2033 ml_physaddr_in_bootkc_range(vm_offset_t physaddr)
2034 {
2035 	for (int i = 0; i < arm_vm_kernelcache_numranges; i++) {
2036 		if (physaddr >= arm_vm_kernelcache_ranges[i].start_phys && physaddr < arm_vm_kernelcache_ranges[i].end_phys) {
2037 			return true;
2038 		}
2039 	}
2040 	return false;
2041 }
2042 #endif /* defined(CONFIG_SPTM) */
2043 
/*
 *	Routine:        ml_static_mfree
 *	Function:	Release a statically mapped, page-aligned kernel range back
 *			to the VM as canonical free pages, updating wire/kernelcache
 *			page accounting. Addresses below the physical aperture are
 *			silently ignored.
 */
void
ml_static_mfree(
	vm_offset_t vaddr,
	vm_size_t   size)
{
	vm_offset_t vaddr_cur;
	vm_offset_t paddr_cur;
	ppnum_t     ppn;
	uint32_t    freed_pages = 0;
	uint32_t    freed_kernelcache_pages = 0;


	/* It is acceptable (if bad) to fail to free. */
	if (vaddr < physmap_base) {
		return;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		/*
		 * Some clients invoke ml_static_mfree on non-physical aperture
		 * addresses.  To support this, we convert the virtual address
		 * to a physical aperture address, and remove all mappings of
		 * the page as we update the physical aperture protections.
		 */
		vm_offset_t vaddr_papt = phystokv(kvtophys(vaddr_cur));
		ppn = pmap_find_phys(kernel_pmap, vaddr_papt);

		if (ppn != (vm_offset_t) NULL) {
			/*
			 * It is not acceptable to fail to update the protections on a page
			 * we will release to the VM.  We need to either panic or continue.
			 * For now, we'll panic (to help flag if there is memory we can
			 * reclaim).
			 */
			pmap_disconnect(ppn);
			/* Make the page RW before handing it back to the VM. */
			if (ml_static_protect(vaddr_papt, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
				panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
			}

			paddr_cur = ptoa(ppn);


			vm_page_create_canonical(ppn);
			freed_pages++;
			/* Track pages released from the boot kernelcache separately. */
#if defined(CONFIG_SPTM)
			if (ml_physaddr_in_bootkc_range(paddr_cur))
#else
			if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end)
#endif
			{
				freed_kernelcache_pages++;
			}
		}
	}

	/* Batch-update the global page counters under the page-queues lock. */
	vm_page_lockspin_queues();
	vm_page_wire_count -= freed_pages;
	vm_page_wire_count_initial -= freed_pages;
	vm_page_kernelcache_count -= freed_kernelcache_pages;
	vm_page_unlock_queues();
#if     DEBUG
	kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
#endif
}
2116 
/*
 * Routine: ml_page_protection_type
 * Function: Returns the type of page protection that the system supports.
 */
ml_page_protection_t
ml_page_protection_type(void)
{
#if CONFIG_SPTM
	/* SPTM-based page protection. */
	return 2;
#elif XNU_MONITOR
	/* PPL (page protection layer) based protection. */
	return 1;
#else
	/* No dedicated page-protection monitor. */
	return 0;
#endif
}
2132 
/* virtual to physical on wired pages */
vm_offset_t
ml_vtophys(vm_offset_t vaddr)
{
	/* Thin wrapper around the kernel VA -> PA translation. */
	return kvtophys(vaddr);
}
2139 
2140 /*
2141  * Routine: ml_nofault_copy
2142  * Function: Perform a physical mode copy if the source and destination have
2143  * valid translations in the kernel pmap. If translations are present, they are
2144  * assumed to be wired; e.g., no attempt is made to guarantee that the
2145  * translations obtained remain valid for the duration of the copy process.
2146  */
2147 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)2148 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
2149 {
2150 	addr64_t        cur_phys_dst, cur_phys_src;
2151 	vm_size_t       count, nbytes = 0;
2152 
2153 	while (size > 0) {
2154 		if (!(cur_phys_src = kvtophys(virtsrc))) {
2155 			break;
2156 		}
2157 		if (!(cur_phys_dst = kvtophys(virtdst))) {
2158 			break;
2159 		}
2160 		if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
2161 		    !pmap_valid_address(trunc_page_64(cur_phys_src))) {
2162 			break;
2163 		}
2164 		count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
2165 		if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
2166 			count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
2167 		}
2168 		if (count > size) {
2169 			count = size;
2170 		}
2171 
2172 		bcopy_phys(cur_phys_src, cur_phys_dst, count);
2173 
2174 		nbytes += count;
2175 		virtsrc += count;
2176 		virtdst += count;
2177 		size -= count;
2178 	}
2179 
2180 	return nbytes;
2181 }
2182 
/*
 *	Routine:        ml_validate_nofault
 *	Function: Validate that the address range has valid translations
 *			in the kernel pmap.  If translations are present, they are
 *			assumed to be wired; i.e. no attempt is made to guarantee
 *			that the translations persist after the check.
 *  Returns: TRUE if the range is mapped and will not cause a fault,
 *			FALSE otherwise.
 */

boolean_t
ml_validate_nofault(
	vm_offset_t virtsrc, vm_size_t size)
{
	addr64_t cur_phys_src;
	uint32_t count;

	while (size > 0) {
		/* Fail if any page in the range lacks a translation. */
		if (!(cur_phys_src = kvtophys(virtsrc))) {
			return FALSE;
		}
		if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
			return FALSE;
		}
		/* Advance page-by-page (or by the remaining size, if smaller). */
		count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
		if (count > size) {
			count = (uint32_t)size;
		}

		virtsrc += count;
		size -= count;
	}

	return TRUE;
}
2218 
/* No bounce pool on arm64: report an empty (0, 0) region. */
void
ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
{
	*phys_addr = 0;
	*size = 0;
}
2225 
/* No-op on arm64: no platform action is taken when RT threads become active. */
void
active_rt_threads(__unused boolean_t active)
{
}
2230 
/* Default (no-op) CPU QoS callback, used until a real one is registered. */
static void
cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
{
	return;
}
2236 
2237 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
2238 
2239 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)2240 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
2241 {
2242 	if (cpu_qos_cb != NULL) {
2243 		cpu_qos_update = cpu_qos_cb;
2244 	} else {
2245 		cpu_qos_update = cpu_qos_cb_default;
2246 	}
2247 }
2248 
/*
 * Notify the registered CPU QoS callback of a thread urgency change,
 * bracketing the call with scheduler tracepoints.
 */
void
thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
{
	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);

	cpu_qos_update((int)urgency, rt_period, rt_deadline);

	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
}
2258 
/* No-op on arm64: the run count is not used by this platform layer. */
void
machine_run_count(__unused uint32_t count)
{
}
2263 
2264 #if KASAN
2265 vm_offset_t ml_stack_base(void);
2266 vm_size_t ml_stack_size(void);
2267 
2268 vm_offset_t
ml_stack_base(void)2269 ml_stack_base(void)
2270 {
2271 	uintptr_t local = (uintptr_t) &local;
2272 	vm_offset_t     intstack_top_ptr;
2273 
2274 	intstack_top_ptr = getCpuDatap()->intstack_top;
2275 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2276 		return intstack_top_ptr - INTSTACK_SIZE;
2277 	} else {
2278 		return current_thread()->kernel_stack;
2279 	}
2280 }
2281 vm_size_t
ml_stack_size(void)2282 ml_stack_size(void)
2283 {
2284 	uintptr_t local = (uintptr_t) &local;
2285 	vm_offset_t     intstack_top_ptr;
2286 
2287 	intstack_top_ptr = getCpuDatap()->intstack_top;
2288 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
2289 		return INTSTACK_SIZE;
2290 	} else {
2291 		return kernel_stack_size;
2292 	}
2293 }
2294 #endif
2295 
2296 #ifdef CONFIG_KCOV
2297 
/* Return the kernel-coverage data for the current CPU. */
kcov_cpu_data_t *
current_kcov_data(void)
{
	return &current_cpu_datap()->cpu_kcov_data;
}
2303 
/* Return the kernel-coverage data for the given CPU id. */
kcov_cpu_data_t *
cpu_kcov_data(int cpuid)
{
	return &cpu_datap(cpuid)->cpu_kcov_data;
}
2309 
2310 #endif /* CONFIG_KCOV */
2311 
/* Machine timeouts are never suspended on this platform. */
boolean_t
machine_timeout_suspended(void)
{
	return FALSE;
}
2317 
/* Interrupt prewarming is not supported on arm64; always fails. */
kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)
{
	return KERN_FAILURE;
}
2323 
2324 #if HAS_APPLE_GENERIC_TIMER
2325 /* The kernel timer APIs always use the Apple timebase */
2326 #define KERNEL_CNTV_TVAL_EL0 "S3_1_C15_C15_4"
2327 #define KERNEL_CNTVCT_EL0    "S3_4_C15_C11_7"
2328 #define KERNEL_CNTVCTSS_EL0  "S3_4_C15_C10_6"
2329 #define KERNEL_CNTV_CTL_EL0  "S3_1_C15_C0_5"
2330 #define KERNEL_CNTKCTL_EL1   "S3_4_C15_C9_6"
2331 #else
2332 #define KERNEL_CNTV_TVAL_EL0 "CNTV_TVAL_EL0"
2333 #define KERNEL_CNTVCT_EL0    "CNTVCT_EL0"
2334 #define KERNEL_CNTVCTSS_EL0  "CNTVCTSS_EL0"
2335 #define KERNEL_CNTV_CTL_EL0  "CNTV_CTL_EL0"
2336 #define KERNEL_CNTKCTL_EL1   "CNTKCTL_EL1"
2337 #endif
2338 
/*
 * Assumes fiq, irq disabled.
 * Program the per-CPU decrementer: record the value in cpu_data, then
 * either invoke the platform override function or write the virtual
 * timer's TVAL register directly.
 */
void
ml_set_decrementer(uint32_t dec_value)
{
	cpu_data_t      *cdp = getCpuDatap();

	assert(ml_get_interrupts_enabled() == FALSE);
	cdp->cpu_decrementer = dec_value;

	if (cdp->cpu_set_decrementer_func) {
		/* Platform-provided override, if one was registered. */
		cdp->cpu_set_decrementer_func(dec_value);
	} else {
		__builtin_arm_wsr64(KERNEL_CNTV_TVAL_EL0, (uint64_t)dec_value);
	}
}
2356 
/**
 * Perform a read of the timebase which is permitted to be executed
 * speculatively and/or out of program order.
 */
static inline uint64_t
speculative_timebase(void)
{
	return __builtin_arm_rsr64(KERNEL_CNTVCT_EL0);
}
2366 
/**
 * Read a non-speculative view of the timebase if one is available,
 * otherwise fall back on an ISB to prevent speculation and
 * enforce ordering.
 */
static inline uint64_t
nonspeculative_timebase(void)
{
#if   __ARM_ARCH_8_6__
	/* ARMv8.6 provides a self-synchronized counter view (CNTVCTSS). */
	return __builtin_arm_rsr64(KERNEL_CNTVCTSS_EL0);
#else
	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
	// to other instructions executed on the same processor."
	__builtin_arm_isb(ISB_SY);
	return speculative_timebase();
#endif
}
2385 
2386 
2387 uint64_t
ml_get_hwclock()2388 ml_get_hwclock()
2389 {
2390 	uint64_t timebase = nonspeculative_timebase();
2391 	return timebase;
2392 }
2393 
2394 uint64_t
ml_get_hwclock_speculative()2395 ml_get_hwclock_speculative()
2396 {
2397 	uint64_t timebase = speculative_timebase();
2398 	return timebase;
2399 }
2400 
/*
 * Return the current timebase: hardware clock plus this CPU's base offset.
 * The base offset is re-read until stable so a concurrent S2R update of
 * cpu_base_timebase cannot yield a torn sum.
 */
uint64_t
ml_get_timebase()
{
	uint64_t clock, timebase;

	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
	do {
		timebase = getCpuDatap()->cpu_base_timebase;
		os_compiler_barrier();
		clock = ml_get_hwclock();
		os_compiler_barrier();
	} while (getCpuDatap()->cpu_base_timebase != timebase);

	return clock + timebase;
}
2416 
/**
 * Issue a barrier that guarantees all prior memory accesses will complete
 * before any subsequent timebase reads.
 */
void
ml_memory_to_timebase_fence(void)
{
	__builtin_arm_dmb(DMB_SY);
	/*
	 * The load below is ordered after the DMB; since the value is always 0,
	 * the cbnz never branches.  This creates a load the subsequent timebase
	 * read can be ordered against.
	 */
	const uint64_t take_backwards_branch = 0;
	asm volatile (
        "1:"
                "ldr	x0, [%[take_backwards_branch]]" "\n"
                "cbnz	x0, 1b"                         "\n"
                :
                : [take_backwards_branch] "r"(&take_backwards_branch)
                : "x0"
        );

	/* throwaway read to prevent ml_get_speculative_timebase() reordering */
	(void)ml_get_hwclock();
}
2438 
/**
 * Issue a barrier that guarantees all prior timebase reads will
 * be ordered before any subsequent memory accesses.
 */
void
ml_timebase_to_memory_fence(void)
{
	__builtin_arm_isb(ISB_SY);
}
2448 
/*
 * Get the speculative timebase without an ISB.
 * As in ml_get_timebase(), the per-CPU base offset is re-read until stable
 * to guard against a concurrent S2R update.
 */
uint64_t
ml_get_speculative_timebase(void)
{
	uint64_t clock, timebase;

	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
	do {
		timebase = getCpuDatap()->cpu_base_timebase;
		os_compiler_barrier();
		clock = speculative_timebase();

		os_compiler_barrier();
	} while (getCpuDatap()->cpu_base_timebase != timebase);

	return clock + timebase;
}
2468 
/* Entropy source: the low-latency speculative timebase read suffices here. */
uint64_t
ml_get_timebase_entropy(void)
{
	return ml_get_speculative_timebase();
}
2474 
2475 uint32_t
ml_get_decrementer(void)2476 ml_get_decrementer(void)
2477 {
2478 	cpu_data_t *cdp = getCpuDatap();
2479 	uint32_t dec;
2480 
2481 	assert(ml_get_interrupts_enabled() == FALSE);
2482 
2483 	if (cdp->cpu_get_decrementer_func) {
2484 		dec = cdp->cpu_get_decrementer_func();
2485 	} else {
2486 		uint64_t wide_val;
2487 
2488 		wide_val = __builtin_arm_rsr64(KERNEL_CNTV_TVAL_EL0);
2489 		dec = (uint32_t)wide_val;
2490 		assert(wide_val == (uint64_t)dec);
2491 	}
2492 
2493 	return dec;
2494 }
2495 
2496 boolean_t
ml_get_timer_pending(void)2497 ml_get_timer_pending(void)
2498 {
2499 	uint64_t cntv_ctl = __builtin_arm_rsr64(KERNEL_CNTV_CTL_EL0);
2500 	return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2501 }
2502 
/*
 * Handle an arm platform syscall. The syscall code is in x3; codes 2 and 3
 * set/get the cthread (TLS) pointer, codes 0/1 are retired cache-flush
 * calls. Never returns: exits via thread_exception_return().
 */
__attribute__((noreturn))
void
platform_syscall(arm_saved_state_t *state)
{
	uint32_t code;

#define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */

	code = (uint32_t)get_saved_state_reg(state, 3);

	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
	    get_saved_state_reg(state, 0),
	    get_saved_state_reg(state, 1),
	    get_saved_state_reg(state, 2));

	switch (code) {
	case 2:
		/* set cthread */
		platform_syscall_kprintf("set cthread self.\n");
		thread_set_cthread_self(get_saved_state_reg(state, 0));
		break;
	case 3:
		/* get cthread */
		platform_syscall_kprintf("get cthread self.\n");
		set_user_saved_state_reg(state, 0, thread_get_cthread_self());
		break;
	case 0: /* I-Cache flush (removed) */
	case 1: /* D-Cache flush (removed) */
	default:
		platform_syscall_kprintf("unknown: %d\n", code);
		break;
	}

	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
	    get_saved_state_reg(state, 0));

	thread_exception_return();
}
2541 
/*
 * Enable the timebase event stream: program CNTKCTL_EL1 so an event is
 * generated on the 1->0 transition of timebase bit `bit_index`, and (where
 * supported) grant EL0 access to the timebase registers.
 */
static void
_enable_timebase_event_stream(uint32_t bit_index)
{
	if (bit_index >= 64) {
		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
	}

	uint64_t cntkctl = __builtin_arm_rsr64(KERNEL_CNTKCTL_EL1);

	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
	cntkctl |= CNTKCTL_EL1_EVNTEN;
	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */

	/*
	 * If the SOC supports it (and it isn't broken), enable
	 * EL0 access to the timebase registers.
	 */
	if (user_timebase_type() != USER_TIMEBASE_NONE) {
		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
	}

	__builtin_arm_wsr64(KERNEL_CNTKCTL_EL1, cntkctl);

#if HAS_APPLE_GENERIC_TIMER
	/* Enable EL0 access to the ARM timebase registers too */
	uint64_t arm_cntkctl = __builtin_arm_rsr64("CNTKCTL_EL1");
	arm_cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
	__builtin_arm_wsr64("CNTKCTL_EL1", arm_cntkctl);
#endif
}
2572 
/*
 * Turn timer on, unmask that interrupt.
 */
static void
_enable_virtual_timer(void)
{
	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */

	__builtin_arm_wsr64(KERNEL_CNTV_CTL_EL0, cntvctl);
	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
#if HAS_APPLE_GENERIC_TIMER
	/* Also mask the Apple generic physical timer's control register. */
	__builtin_arm_wsr64("S3_1_C15_C13_4", CNTP_CTL_EL0_IMASKED);
#endif
}
2588 
/* Enable the virtual timer for this CPU; interrupts must still be disabled. */
void
fiq_context_init(boolean_t enable_fiq __unused)
{
	/* Interrupts still disabled. */
	assert(ml_get_interrupts_enabled() == FALSE);
	_enable_virtual_timer();
}
2596 
/* Enable the WFE timeout event stream using the precomputed bit index. */
void
wfe_timeout_init(void)
{
	_enable_timebase_event_stream(arm64_eventi);
}
2602 
2603 /**
2604  * Configures, but does not enable, the WFE event stream. The event stream
2605  * generates an event at a set interval to act as a timeout for WFEs.
2606  *
2607  * This function sets the static global variable arm64_eventi to be the proper
2608  * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2609  * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2610  * is used by wfe_timeout_init to actually poke the registers and enable the
2611  * event stream.
2612  *
2613  * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2614  * is the trigger for the system to generate an event. The trigger can occur on
2615  * either the rising or falling edge of the bit depending on the value of
2616  * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2617  * falling edge (1->0) transition to generate events.
2618  */
2619 void
wfe_timeout_configure(void)2620 wfe_timeout_configure(void)
2621 {
2622 	/* Could fill in our own ops here, if we needed them */
2623 	uint64_t        ticks_per_sec, ticks_per_event, events_per_sec = 0;
2624 	uint32_t        bit_index;
2625 
2626 	if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2627 		if (events_per_sec <= 0) {
2628 			events_per_sec = 1;
2629 		} else if (events_per_sec > USEC_PER_SEC) {
2630 			events_per_sec = USEC_PER_SEC;
2631 		}
2632 	} else {
2633 		events_per_sec = USEC_PER_SEC;
2634 	}
2635 	ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2636 	ticks_per_event = ticks_per_sec / events_per_sec;
2637 
2638 	/* Bit index of next power of two greater than ticks_per_event */
2639 	bit_index = flsll(ticks_per_event) - 1;
2640 	/* Round up to next power of two if ticks_per_event is initially power of two */
2641 	if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2642 		bit_index++;
2643 	}
2644 
2645 	/*
2646 	 * The timer can only trigger on rising or falling edge, not both; we don't
2647 	 * care which we trigger on, but we do need to adjust which bit we are
2648 	 * interested in to account for this.
2649 	 *
2650 	 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2651 	 * falling edge of the given bit. Therefore, we must decrement the bit index
2652 	 * by one as when the bit before the one we care about makes a 1 -> 0
2653 	 * transition, the bit we care about makes a 0 -> 1 transition.
2654 	 *
2655 	 * For example if we want an event generated every 8 ticks (if we calculated
2656 	 * a bit_index of 3), we would want the event to be generated whenever the
2657 	 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2658 	 * see that the bit at index 2 makes a falling transition in this scenario,
2659 	 * so we would want EVENTI to be 2 instead of 3.
2660 	 */
2661 	if (bit_index != 0) {
2662 		bit_index--;
2663 	}
2664 
2665 	arm64_eventi = bit_index;
2666 }
2667 
2668 boolean_t
ml_delay_should_spin(uint64_t interval)2669 ml_delay_should_spin(uint64_t interval)
2670 {
2671 	cpu_data_t     *cdp = getCpuDatap();
2672 
2673 	if (cdp->cpu_idle_latency) {
2674 		return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2675 	} else {
2676 		/*
2677 		 * Early boot, latency is unknown. Err on the side of blocking,
2678 		 * which should always be safe, even if slow
2679 		 */
2680 		return FALSE;
2681 	}
2682 }
2683 
/* Return whether the thread uses a 64-bit address space. */
boolean_t
ml_thread_is64bit(thread_t thread)
{
	return thread_is_64bit_addr(thread);
}
2689 
/* Optional debug hook: inject a delay on yield when yield_delay_us is set. */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us) {
		delay(yield_delay_us);
	}
#endif
}
2699 
/* No-op on arm64: forced timer evaluation is not used here. */
void
ml_timer_evaluate(void)
{
}
2704 
/* Forced timer evaluation never occurs on arm64. */
boolean_t
ml_timer_forced_evaluation(void)
{
	return FALSE;
}
2710 
void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
{
	/*
	 * For now: update the resource coalition stats of the
	 * current thread's coalition
	 */
	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
}
2720 
/* Per-thread GPU time is not tracked on this platform; always 0. */
uint64_t
ml_gpu_stat(__unused thread_t t)
{
	return 0;
}
2726 
/* Out-of-line wrapper around the fast inline current-thread accessor. */
thread_t
current_thread(void)
{
	return current_thread_fast();
}
2732 
2733 #if defined(HAS_APPLE_PAC)
/* Return the task's disable-user-JOP (pointer authentication) flag. */
uint8_t
ml_task_get_disable_user_jop(task_t task)
{
	assert(task);
	return task->disable_user_jop;
}
2740 
/* Set the task's disable-user-JOP (pointer authentication) flag. */
void
ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
{
	assert(task);
	task->disable_user_jop = disable_user_jop;
}
2747 
2748 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2749 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2750 {
2751 	assert(thread);
2752 	if (disable_user_jop) {
2753 		thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2754 	} else {
2755 		thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2756 	}
2757 }
2758 
2759 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2760 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2761 {
2762 	if (inherit) {
2763 		task->rop_pid = parent_task->rop_pid;
2764 	} else {
2765 		task->rop_pid = early_random();
2766 	}
2767 }
2768 
2769 /**
2770  * jop_pid may be inherited from the parent task or generated inside the shared
2771  * region.  Unfortunately these two parameters are available at very different
2772  * times during task creation, so we need to split this into two steps.
2773  */
2774 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit,boolean_t disable_user_jop)2775 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit, boolean_t disable_user_jop)
2776 {
2777 	if (inherit) {
2778 		task->jop_pid = parent_task->jop_pid;
2779 	} else if (disable_user_jop) {
2780 		task->jop_pid = ml_non_arm64e_user_jop_pid();
2781 	} else {
2782 		task->jop_pid = ml_default_jop_pid();
2783 	}
2784 }
2785 
/*
 * Second step of JOP-PID assignment: derive the task's jop_pid from its
 * shared region key. Tasks that are dying (no shared region, or jetsam has
 * cleared shared_region_id) get an arbitrary random value instead.
 */
void
ml_task_set_jop_pid_from_shared_region(task_t task, boolean_t disable_user_jop)
{
	if (disable_user_jop) {
		task->jop_pid = ml_non_arm64e_user_jop_pid();
		return;
	}

	vm_shared_region_t sr = vm_shared_region_get(task);
	/*
	 * If there's no shared region, we can assign the key arbitrarily.  This
	 * typically happens when Mach-O image activation failed part of the way
	 * through, and this task is in the middle of dying with SIGKILL anyway.
	 */
	if (__improbable(!sr)) {
		task->jop_pid = early_random();
		return;
	}
	/* Only needed the existence check; drop the reference immediately. */
	vm_shared_region_deallocate(sr);

	/*
	 * Similarly we have to worry about jetsam having killed the task and
	 * already cleared the shared_region_id.
	 */
	task_lock(task);
	if (task->shared_region_id != NULL) {
		task->jop_pid = shared_region_find_key(task->shared_region_id);
	} else {
		task->jop_pid = early_random();
	}
	task_unlock(task);
}
2818 
/**
 * Propagate the owning task's JOP diversifier into a thread's machine state,
 * where the context-switch path can load it.
 *
 * @param thread thread being initialized
 * @param task task the thread belongs to
 */
void
ml_thread_set_jop_pid(thread_t thread, task_t task)
{
	thread->machine.jop_pid = task->jop_pid;
}
2824 #endif /* defined(HAS_APPLE_PAC) */
2825 
#if DEVELOPMENT || DEBUG
/* Latched bitmask of minor-badness events observed since boot (bit = badness_id). */
static uint64_t minor_badness_suffered = 0;
#endif
/**
 * Record a minor, non-fatal anomaly.  On DEVELOPMENT/DEBUG kernels the event
 * is latched into a per-ID bit for post-hoc inspection; on RELEASE kernels
 * this compiles to a no-op.
 *
 * @param badness_id bit index identifying the event.
 *        NOTE(review): assumes badness_id < 64 — a larger value makes the
 *        shift undefined; confirm all callers.
 */
void
ml_report_minor_badness(uint32_t __unused badness_id)
{
	#if DEVELOPMENT || DEBUG
	(void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
	#endif
}
2836 
#if HAS_APPLE_PAC
/**
 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
 */
void *
ml_poison_ptr(void *ptr, ptrauth_key key)
{
	/* Bit 0 of the key selects the B-family keys, which poison with code 2. */
	const bool b_key = (key & (1ULL << 0)) != 0;
	const uint64_t error_code = b_key ? 2 : 1;

	/* Bit 55 distinguishes kernel (TTBR1) from user (TTBR0) pointers. */
	const bool kernel_pointer = ((uintptr_t)ptr & (1ULL << 55)) != 0;
	/* Bit 1 of the key selects the data keys (DA/DB). */
	const bool data_key = (key & (1ULL << 1)) != 0;
	/* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
	const bool tbi = data_key && !kernel_pointer;
	const unsigned int poison_shift = tbi ? 53 : 61;

	/* Clear the two poison bits, then deposit the error code there. */
	uintptr_t poisoned = (uintptr_t)ptr;
	poisoned &= ~(3ULL << poison_shift);
	poisoned |= error_code << poison_shift;
	return (void *)poisoned;
}
#endif /* HAS_APPLE_PAC */
2869 
2870 #ifdef CONFIG_XNUPOST
2871 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2872 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2873 {
2874 	thread_t thread = current_thread();
2875 	thread->machine.expected_fault_handler = expected_fault_handler;
2876 	thread->machine.expected_fault_addr = expected_fault_addr;
2877 	thread->machine.expected_fault_pc = 0;
2878 }
2879 
2880 /** Expect an exception to be thrown at EXPECTED_FAULT_PC */
2881 void
ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_pc)2882 ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_pc)
2883 {
2884 	thread_t thread = current_thread();
2885 	thread->machine.expected_fault_handler = expected_fault_handler;
2886 	thread->machine.expected_fault_addr = 0;
2887 	uintptr_t raw_func = (uintptr_t)ptrauth_strip(
2888 		(void *)expected_fault_pc,
2889 		ptrauth_key_function_pointer);
2890 	thread->machine.expected_fault_pc = raw_func;
2891 }
2892 
2893 void
ml_expect_fault_end(void)2894 ml_expect_fault_end(void)
2895 {
2896 	thread_t thread = current_thread();
2897 	thread->machine.expected_fault_handler = NULL;
2898 	thread->machine.expected_fault_addr = 0;
2899 	thread->machine.expected_fault_pc = 0;
2900 }
2901 #endif /* CONFIG_XNUPOST */
2902 
/**
 * Machine-layer hook run early on wake-from-hibernate, before the VM is used:
 * rebuilds the VM structures saved in the hibernation image.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState == kIOHibernateStateWakingFromHibernate) {
		hibernate_rebuild_vm_structs();
	}
#endif /* HIBERNATION */
}
2913 
/**
 * Machine-layer hook run late on wake-from-hibernate: finishes machine init,
 * releases the hibernation VM lock, and clears the per-cpu hibernate flag.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState == kIOHibernateStateWakingFromHibernate) {
		hibernate_machine_init();
		hibernate_vm_lock_end();
		current_cpu_datap()->cpu_hibernate = 0;
	}
#endif /* HIBERNATION */
}
2925 
2926 /**
2927  * Return back a machine-dependent array of address space regions that should be
2928  * reserved by the VM (pre-mapped in the address space). This will prevent user
2929  * processes from allocating or deallocating from within these regions.
2930  *
2931  * @param vm_is64bit True if the process has a 64-bit address space.
2932  * @param regions An out parameter representing an array of regions to reserve.
2933  *
2934  * @return The number of reserved regions returned through `regions`.
2935  */
2936 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,const struct vm_reserved_region ** regions)2937 ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
2938 {
2939 	assert(regions != NULL);
2940 
2941 	/**
2942 	 * Reserved regions only apply to 64-bit address spaces. This is because
2943 	 * we only expect to grow the maximum user VA address on 64-bit address spaces
2944 	 * (we've essentially already reached the max for 32-bit spaces). The reserved
2945 	 * regions should safely fall outside of the max user VA for 32-bit processes.
2946 	 */
2947 	if (vm_is64bit) {
2948 		*regions = vm_reserved_regions;
2949 		return ARRAY_COUNT(vm_reserved_regions);
2950 	} else {
2951 		/* Don't reserve any VA regions on arm64_32 processes. */
2952 		*regions = NULL;
2953 		return 0;
2954 	}
2955 }
2956 
/* These WFE recommendations are expected to be updated on a relatively
 * infrequent cadence, possibly from a different cluster, hence
 * false cacheline sharing isn't expected to be material
 */
static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];

/**
 * Publish a new WFE timeout recommendation for a CPU cluster.
 *
 * @param wfe_cluster_id cluster whose recommendation is updated
 * @param wfe_timeout_abstime_interval recommended WFE timeout, absolute-time units
 * @param wfe_hint_flags currently unused
 * @return 0; inputs are validated by assert() only
 */
uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
{
	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
	/* Relaxed store: readers (ml_cluster_wfe_timeout) tolerate races. */
	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
	return 0; /* Success */
}
2971 
2972 #if DEVELOPMENT || DEBUG
2973 int wfe_rec_max = 0;
2974 int wfe_rec_none = 0;
2975 uint64_t wfe_rec_override_mat = 0;
2976 uint64_t wfe_rec_clamp = 0;
2977 #endif
2978 
2979 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2980 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2981 {
2982 	/* This and its consumer does not synchronize vis-a-vis updates
2983 	 * of the recommendation; races are acceptable.
2984 	 */
2985 	uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2986 #if DEVELOPMENT || DEBUG
2987 	if (wfe_rec_clamp) {
2988 		wfet = MIN(wfe_rec_clamp, wfet);
2989 	}
2990 
2991 	if (wfe_rec_max) {
2992 		for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2993 			if (arm64_cluster_wfe_recs[i] > wfet) {
2994 				wfet = arm64_cluster_wfe_recs[i];
2995 			}
2996 		}
2997 	}
2998 
2999 	if (wfe_rec_none) {
3000 		wfet = 0;
3001 	}
3002 
3003 	if (wfe_rec_override_mat) {
3004 		wfet = wfe_rec_override_mat;
3005 	}
3006 #endif
3007 	return wfet;
3008 }
3009 
/**
 * Determine whether an address lies within a stack owned by a non-XNU domain
 * (SPTM/TXM per-cpu stacks, or the PPL stacks on XNU_MONITOR systems).
 *
 * @param addr virtual address to test
 * @return true if addr is inside a non-XNU stack range, false otherwise
 */
__pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
{
#if CONFIG_SPTM
	/**
	 * If the address is within one of the SPTM-allocated per-cpu stacks, then
	 * return true.
	 */
	if ((addr >= SPTMArgs->cpu_stack_papt_start) &&
	    (addr < SPTMArgs->cpu_stack_papt_end)) {
		return true;
	}

	/**
	 * If the address is within one of the TXM thread stacks, then return true.
	 * The SPTM guarantees that these stacks are virtually contiguous.
	 *
	 * NOTE(review): the upper bound is the BASE of the last CPU's stack
	 * (txm_thread_stacks[MAX_CPUS - 1]), so addresses inside the last stack
	 * itself appear excluded — presumably the entries are stack tops, or this
	 * is intentional; confirm against the SPTM layout.
	 */
	if ((addr >= SPTMArgs->txm_thread_stacks[0]) &&
	    (addr < SPTMArgs->txm_thread_stacks[MAX_CPUS - 1])) {
		return true;
	}

	return false;
#elif XNU_MONITOR
	/* PPL stacks occupy one contiguous [pmap_stacks_start, pmap_stacks_end) range. */
	return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
#else
	return false;
#endif /* CONFIG_SPTM || XNU_MONITOR */
}
3039 
/**
 * Extract the PC to use for backtracing from a 64-bit saved state, accounting
 * for the SPTM re-entry convention described below.
 *
 * @param state 64-bit saved state (asserted non-NULL and saved_state64)
 * @return the interrupted PC value appropriate for backtracing
 */
uint64_t
ml_get_backtrace_pc(struct arm_saved_state *state)
{
	assert((state != NULL) && is_saved_state64(state));

#if CONFIG_SPTM
	/**
	 * On SPTM-based systems, when a non-XNU domain (e.g., SPTM) is interrupted,
	 * the PC value saved into the state is not the actual PC at the interrupted
	 * point, but a fixed value to a handler that knows how to re-enter the
	 * interrupted domain. The interrupted domain's actual PC value is saved
	 * into x14, so let's return that instead.
	 */
	if (ml_addr_in_non_xnu_stack(get_saved_state_fp(state))) {
		return saved_state64(state)->x[14];
	}
#endif /* CONFIG_SPTM */

	return get_saved_state_pc(state);
}
3060 
3061 
3062 bool
ml_paddr_is_exclaves_owned(vm_offset_t paddr)3063 ml_paddr_is_exclaves_owned(vm_offset_t paddr)
3064 {
3065 #if CONFIG_SPTM
3066 	const sptm_frame_type_t type = sptm_get_frame_type(paddr);
3067 	return type == SK_DEFAULT || type == SK_IO;   // SK_SHARED_R[OW] are not exclusively exclaves frames
3068 #else
3069 	#pragma unused(paddr)
3070 	return false;
3071 #endif /* CONFIG_SPTM */
3072 }
3073 
/**
 * Panic because an ARM saved-state accessor expected user saved-state but was
 * passed non-user saved-state.
 *
 * Kept out-of-line so the (hot) accessor fast paths stay small.
 *
 * @param ss invalid saved-state (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_old_cpsr(const arm_saved_state_t *ss)
{
	panic("invalid CPSR in user saved-state %p", ss);
}
3085 
/**
 * Panic because an ARM saved-state accessor was passed user saved-state and
 * asked to assign a non-user CPSR.
 *
 * Kept out-of-line so the (hot) accessor fast paths stay small.
 *
 * @param ss original EL0 saved-state
 * @param cpsr invalid new CPSR value (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_new_cpsr(const arm_saved_state_t *ss, uint32_t cpsr)
{
	panic("attempt to set non-user CPSR %#010x on user saved-state %p", cpsr, ss);
}
3098 
3099 
/**
 * Explicitly preallocates a floating point save area.
 * This is a noop on ARM because preallocation isn't required at this time.
 */
void
ml_fp_save_area_prealloc(void)
{
	/* Intentionally empty: the save area is allocated lazily elsewhere. */
}
3108 
3109 
/**
 * Machine-layer hook invoked after task signature processing completes.
 *
 * @param task the task that finished signature processing (unused here)
 *
 * NOTE(review): the comment below refers to a "read ... below", but no read
 * follows in this configuration — presumably other build configs add code
 * after the fence; confirm before removing it.
 */
void
ml_task_post_signature_processing_hook(__unused task_t task)
{
	/**
	 * Have an acquire barrier here to make sure the machine flags read that is going
	 * to happen below is not speculated before the task->t_returnwaitflags earlier
	 * in task_wait_to_return().
	 */
	os_atomic_thread_fence(acquire);

}
3121 
3122