xref: /xnu-8796.141.3/osfmk/arm64/machine_routines.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/cpu_data.h>
35 #include <arm/cpu_data_internal.h>
36 #include <arm/caches_internal.h>
37 #include <arm/misc_protos.h>
38 #include <arm/machdep_call.h>
39 #include <arm/machine_routines.h>
40 #include <arm/rtclock.h>
41 #include <arm/cpuid_internal.h>
42 #include <arm/cpu_capabilities.h>
43 #include <console/serial_protos.h>
44 #include <kern/machine.h>
45 #include <kern/misc_protos.h>
46 #include <prng/random.h>
47 #include <kern/startup.h>
48 #include <kern/thread.h>
49 #include <kern/timer_queue.h>
50 #include <mach/machine.h>
51 #include <machine/atomic.h>
52 #include <machine/config.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_shared_region.h>
56 #include <vm/vm_map.h>
57 #include <sys/codesign.h>
58 #include <sys/kdebug.h>
59 #include <kern/coalition.h>
60 #include <pexpert/device_tree.h>
61 
62 #include <IOKit/IOPlatformExpert.h>
63 #if HIBERNATION
64 #include <IOKit/IOHibernatePrivate.h>
65 #endif /* HIBERNATION */
66 
67 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
68 #include <arm64/amcc_rorgn.h>
69 #endif
70 
71 
72 
73 #include <libkern/section_keywords.h>
74 
75 /**
76  * Release builds apply second stage locks, but astris needs to access
77  * DBG_WRAP* and ACC_OVRD in order to properly halt cores.
78  * This boot-arg will cause second stage lock to be skipped when running
79  * a release kernel on a PROD-fused SoC.
80  */
81 TUNABLE_WRITEABLE(boolean_t, skip_second_stage_lock_on_dev_fused, "skip_second_stage_lock", 0);
82 
83 /**
84  * On supported hardware, debuggable builds make the HID bits read-only
85  * without locking them.  This lets people manually modify HID bits while
86  * debugging, since they can use a debugging tool to first reset the HID
87  * bits back to read/write.  However it will still catch xnu changes that
88  * accidentally write to HID bits after they've been made read-only.
89  */
90 
91 #if KPC
92 #include <kern/kpc.h>
93 #endif
94 
95 #define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
96 #define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
97 
98 #if HAS_CLUSTER
99 static uint8_t cluster_initialized = 0;
100 #endif
101 
102 MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
103 machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout
104 
105 MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
106 
107 MACHINE_TIMEOUT_DEV_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
108 
109 uint64_t low_MutexSpin;
110 int64_t high_MutexSpin;
111 
112 
113 
114 static uint64_t ml_wfe_hint_max_interval;
115 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
116 
117 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
118 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
119 
120 extern vm_offset_t   segLOWEST;
121 extern vm_offset_t   segLOWESTTEXT;
122 extern vm_offset_t   segLASTB;
123 extern unsigned long segSizeLAST;
124 
125 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
126 extern vm_offset_t   vm_kernelcache_base;
127 extern vm_offset_t   vm_kernelcache_top;
128 
129 extern vm_offset_t arm_vm_kernelcache_phys_start;
130 extern vm_offset_t arm_vm_kernelcache_phys_end;
131 
132 #if defined(HAS_IPI)
133 unsigned int gFastIPI = 1;
134 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
135 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
136     kDeferredIPITimerDefault);
137 #endif /* defined(HAS_IPI) */
138 
139 thread_t Idle_context(void);
140 
141 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
142 
143 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
144 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
145 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
146 	.version = CPU_TOPOLOGY_VERSION,
147 	.cpus = topology_cpu_array,
148 	.clusters = topology_cluster_array,
149 };
150 
151 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
152 
153 /**
154  * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
155  * entries of an arbitrary data type.  This is intended for use by specialized consumers
156  * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
157  * as follows:
158  *	hypothetical_array[cluster_offsets[AFF1] + AFF0]
159  * Most consumers should instead use general-purpose facilities such as PERCPU or
160  * ml_get_cpu_number().
161  */
162 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
163 
164 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
165 
166 extern uint32_t lockdown_done;
167 
168 /**
169  * Represents regions of virtual address space that should be reserved
170  * (pre-mapped) in each user address space.
171  */
172 static const struct vm_reserved_region vm_reserved_regions[] = {
173 	{
174 		.vmrr_name = "GPU Carveout",
175 		.vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
176 		.vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
177 	},
178 	/*
179 	 * Reserve the virtual memory space representing the commpage nesting region
180 	 * to prevent user processes from allocating memory within it. The actual
181 	 * page table entries for the commpage are inserted by vm_commpage_enter().
182 	 * This vm_map_enter() just prevents userspace from allocating/deallocating
183 	 * anything within the entire commpage nested region.
184 	 */
185 	{
186 		.vmrr_name = "commpage nesting",
187 		.vmrr_addr = _COMM_PAGE64_NESTING_START,
188 		.vmrr_size = _COMM_PAGE64_NESTING_SIZE
189 	}
190 };
191 
192 uint32_t get_arm_cpu_version(void);
193 
#if defined(HAS_IPI)
/*
 * Build an IPI request word of the given type for the core identified by
 * cpu_mpidr and write it to one of the IPI request system registers.
 *
 * With HAS_CLUSTER, a target in the caller's own cluster (same AFF1) is
 * signalled through S3_5_C15_C0_0 using only its AFF0 core number; any other
 * target is signalled through S3_5_C15_C0_1 with its AFF1 cluster number
 * placed at IPI_RR_TARGET_CLUSTER_SHIFT.  Without HAS_CLUSTER the request
 * always goes through S3_5_C15_C0_1 with the core number alone.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
#if HAS_CLUSTER
	#define IPI_RR_TARGET_CLUSTER_SHIFT 16
	uint64_t self_mpidr;
	uint64_t request;

	/*
	 * NOTE: the caller is expected to be non-preemptible, or at least bound
	 * to a single CPU.  Otherwise the thread could migrate between the
	 * cluster comparison below and the register write that issues the IPI.
	 */
	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Remote cluster: the request carries both cluster and core numbers. */
		request = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", request);
	} else {
		/* Local cluster: the core number is sufficient. */
		request = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", request);
	}
#else
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", request);
#endif
}
#endif
219 
220 #if !defined(HAS_IPI)
221 __dead2
222 #endif
223 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)224 ml_cpu_signal(unsigned int cpu_mpidr __unused)
225 {
226 #if defined(HAS_IPI)
227 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
228 #else
229 	panic("Platform does not support ACC Fast IPI");
230 #endif
231 }
232 
233 #if !defined(HAS_IPI)
234 __dead2
235 #endif
236 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)237 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
238 {
239 #if defined(HAS_IPI)
240 	/* adjust IPI_CR timer countdown value for deferred IPI
241 	 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
242 	 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
243 	 *
244 	 * global register, should only require a single write to update all
245 	 * CPU cores: from Skye ACC user spec section 5.7.3.3
246 	 *
247 	 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
248 	 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
249 	 */
250 	uint64_t abstime;
251 
252 	nanoseconds_to_absolutetime(nanosecs, &abstime);
253 
254 	abstime = MIN(abstime, 0xFFFF);
255 
256 	/* update deferred_ipi_timer_ns with the new clamped value */
257 	absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
258 
259 	MSR("S3_5_C15_C3_1", abstime);
260 #else
261 	(void)nanosecs;
262 	panic("Platform does not support ACC Fast IPI");
263 #endif
264 }
265 
/*
 * Return the deferred-IPI timer value in nanoseconds, as last stored in
 * deferred_ipi_timer_ns.  Returns 0 when built without HAS_IPI.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype rather than an old-style unprototyped declarator.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
275 
276 #if !defined(HAS_IPI)
277 __dead2
278 #endif
279 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)280 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
281 {
282 #if defined(HAS_IPI)
283 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
284 #else
285 	panic("Platform does not support ACC Fast IPI deferral");
286 #endif
287 }
288 
289 #if !defined(HAS_IPI)
290 __dead2
291 #endif
292 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)293 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
294 {
295 #if defined(HAS_IPI)
296 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
297 #else
298 	panic("Platform does not support ACC Fast IPI retraction");
299 #endif
300 }
301 
302 extern uint32_t idle_proximate_io_wfe_unmasked;
303 
304 #define CPUPM_IDLE_WFE 0x5310300
305 static bool
wfe_process_recommendation(void)306 wfe_process_recommendation(void)
307 {
308 	bool ipending = false;
309 	if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
310 		/* Check for an active perf. controller generated
311 		 * WFE recommendation for this cluster.
312 		 */
313 		cpu_data_t *cdp = getCpuDatap();
314 		uint32_t cid = cdp->cpu_cluster_id;
315 		uint64_t wfe_ttd = 0;
316 		uint64_t wfe_deadline = 0;
317 
318 		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
319 			wfe_deadline = mach_absolute_time() + wfe_ttd;
320 		}
321 
322 		if (wfe_deadline != 0) {
323 			/* Poll issuing event-bounded WFEs until an interrupt
324 			 * arrives or the WFE recommendation expires
325 			 */
326 #if DEVELOPMENT || DEBUG
327 			uint64_t wc = cdp->wfe_count;
328 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
329 #endif
330 			/* Issue WFE until the recommendation expires,
331 			 * with IRQs unmasked.
332 			 */
333 			ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true);
334 #if DEVELOPMENT || DEBUG
335 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
336 #endif
337 		}
338 	}
339 	return ipending;
340 }
341 
342 void
machine_idle(void)343 machine_idle(void)
344 {
345 	/* Interrupts are expected to be masked on entry or re-entry via
346 	 * Idle_load_context()
347 	 */
348 	assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
349 	/* Check for, and act on, a WFE recommendation.
350 	 * Bypasses context spill/fill for a minor perf. increment.
351 	 * May unmask and restore IRQ+FIQ mask.
352 	 */
353 	if (wfe_process_recommendation() == false) {
354 		/* If WFE recommendation absent, or WFE deadline
355 		 * arrived with no interrupt pending/processed,
356 		 * fall back to WFI.
357 		 */
358 		Idle_context();
359 	}
360 	__builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
361 }
362 
363 void
OSSynchronizeIO(void)364 OSSynchronizeIO(void)
365 {
366 	__builtin_arm_dsb(DSB_SY);
367 }
368 
/*
 * Read and return the current value of ACTLR_EL1.
 */
uint64_t
get_aux_control(void)
{
	uint64_t actlr;

	MRS(actlr, "ACTLR_EL1");

	return actlr;
}
377 
/*
 * Read and return the current value of SCTLR_EL1.
 */
uint64_t
get_mmu_control(void)
{
	uint64_t sctlr;

	MRS(sctlr, "SCTLR_EL1");

	return sctlr;
}
386 
/*
 * Read and return the current value of TCR_EL1.
 */
uint64_t
get_tcr(void)
{
	uint64_t tcr;

	MRS(tcr, "TCR_EL1");

	return tcr;
}
395 
396 boolean_t
ml_get_interrupts_enabled(void)397 ml_get_interrupts_enabled(void)
398 {
399 	uint64_t        value;
400 
401 	MRS(value, "DAIF");
402 	if (value & DAIF_IRQF) {
403 		return FALSE;
404 	}
405 	return TRUE;
406 }
407 
408 pmap_paddr_t
get_mmu_ttb(void)409 get_mmu_ttb(void)
410 {
411 	pmap_paddr_t    value;
412 
413 	MRS(value, "TTBR0_EL1");
414 	return value;
415 }
416 
417 uint32_t
get_arm_cpu_version(void)418 get_arm_cpu_version(void)
419 {
420 	uint32_t value = machine_read_midr();
421 
422 	/* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
423 	return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
424 }
425 
/*
 * Test feature_bit against AIDR_EL1.
 *
 * Returns true when any bit of feature_bit is set in AIDR_EL1.
 */
bool
ml_feature_supported(uint32_t feature_bit)
{
	uint64_t aidr = 0;

	MRS(aidr, "AIDR_EL1");

	return (aidr & feature_bit) != 0;
}
436 
437 /*
438  * user_cont_hwclock_allowed()
439  *
440  * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
441  * as a continuous time source (e.g. from mach_continuous_time)
442  */
443 boolean_t
user_cont_hwclock_allowed(void)444 user_cont_hwclock_allowed(void)
445 {
446 #if HAS_CONTINUOUS_HWCLOCK
447 	return TRUE;
448 #else
449 	return FALSE;
450 #endif
451 }
452 
453 /*
454  * user_timebase_type()
455  *
456  * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
457  *
458  * USER_TIMEBASE_NONE: EL0 has no access to timebase register
459  * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
460  * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
461  *
462  */
463 
464 uint8_t
user_timebase_type(void)465 user_timebase_type(void)
466 {
467 #if HAS_ACNTVCT
468 	return USER_TIMEBASE_NOSPEC_APPLE;
469 #elif __ARM_ARCH_8_6__
470 	return USER_TIMEBASE_NOSPEC;
471 #else
472 	return USER_TIMEBASE_SPEC;
473 #endif
474 }
475 
476 void
machine_startup(__unused boot_args * args)477 machine_startup(__unused boot_args * args)
478 {
479 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
480 	if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
481 		gFastIPI = 1;
482 	}
483 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
484 
485 
486 	machine_conf();
487 
488 
489 	/*
490 	 * Kick off the kernel bootstrap.
491 	 */
492 	kernel_bootstrap();
493 	/* NOTREACHED */
494 }
495 
496 typedef void (*invalidate_fn_t)(void);
497 
498 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
499 
500 void set_invalidate_hmac_function(invalidate_fn_t fn);
501 
502 void
set_invalidate_hmac_function(invalidate_fn_t fn)503 set_invalidate_hmac_function(invalidate_fn_t fn)
504 {
505 	if (NULL != invalidate_hmac_function) {
506 		panic("Invalidate HMAC function already set");
507 	}
508 
509 	invalidate_hmac_function = fn;
510 }
511 
512 void
machine_lockdown(void)513 machine_lockdown(void)
514 {
515 	arm_vm_prot_finalize(PE_state.bootArgs);
516 
517 #if CONFIG_KERNEL_INTEGRITY
518 #if KERNEL_INTEGRITY_WT
519 	/* Watchtower
520 	 *
521 	 * Notify the monitor about the completion of early kernel bootstrap.
522 	 * From this point forward it will enforce the integrity of kernel text,
523 	 * rodata and page tables.
524 	 */
525 
526 #ifdef MONITOR
527 	monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
528 #endif
529 #endif /* KERNEL_INTEGRITY_WT */
530 
531 #if XNU_MONITOR
532 	pmap_lockdown_ppl();
533 #endif
534 
535 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
536 	/* KTRR
537 	 *
538 	 * Lock physical KTRR region. KTRR region is read-only. Memory outside
539 	 * the region is not executable at EL1.
540 	 */
541 
542 	rorgn_lockdown();
543 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
544 
545 #endif /* CONFIG_KERNEL_INTEGRITY */
546 
547 
548 	if (NULL != invalidate_hmac_function) {
549 		invalidate_hmac_function();
550 	}
551 
552 	lockdown_done = 1;
553 }
554 
555 
556 char           *
machine_boot_info(__unused char * buf,__unused vm_size_t size)557 machine_boot_info(
558 	__unused char *buf,
559 	__unused vm_size_t size)
560 {
561 	return PE_boot_args();
562 }
563 
564 void
slave_machine_init(__unused void * param)565 slave_machine_init(__unused void *param)
566 {
567 	cpu_machine_init();     /* Initialize the processor */
568 	clock_init();           /* Init the clock */
569 }
570 
571 /*
572  *	Routine:        machine_processor_shutdown
573  *	Function:
574  */
575 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)576 machine_processor_shutdown(
577 	__unused thread_t thread,
578 	void (*doshutdown)(processor_t),
579 	processor_t processor)
580 {
581 	return Shutdown_context(doshutdown, processor);
582 }
583 
584 /*
585  *      Routine:        ml_init_lock_timeout
586  *      Function:
587  */
588 static void __startup_func
ml_init_lock_timeout(void)589 ml_init_lock_timeout(void)
590 {
591 	/*
592 	 * This function is called after STARTUP_SUB_TIMEOUTS
593 	 * initialization, so using the "legacy" boot-args here overrides
594 	 * the ml-timeout-...  configuration. (Given that these boot-args
595 	 * here are usually explicitly specified, this makes sense by
596 	 * overriding ml-timeout-..., which may come from the device tree.
597 	 */
598 
599 	uint64_t lto_timeout_ns;
600 	uint64_t lto_abstime;
601 	uint32_t slto;
602 
603 	if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
604 		lto_timeout_ns = slto * NSEC_PER_USEC;
605 		nanoseconds_to_absolutetime(lto_timeout_ns, &lto_abstime);
606 		os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
607 	} else {
608 		lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
609 		absolutetime_to_nanoseconds(lto_abstime, &lto_timeout_ns);
610 	}
611 
612 	os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
613 
614 	if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
615 		nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, &lto_abstime);
616 		os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
617 	} else if (lto_abstime != 0) {
618 		os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
619 	} // else take default from MACHINE_TIMEOUT.
620 
621 	uint64_t mtxspin;
622 	uint64_t mtx_abstime;
623 	if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
624 		if (mtxspin > USEC_PER_SEC >> 4) {
625 			mtxspin =  USEC_PER_SEC >> 4;
626 		}
627 		nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
628 		os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
629 	} else {
630 		mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
631 	}
632 
633 	low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
634 	/*
635 	 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
636 	 * real_ncpus is not set at this time
637 	 *
638 	 * NOTE: active spinning is disabled in arm. It can be activated
639 	 * by setting high_MutexSpin through the sysctl.
640 	 */
641 	high_MutexSpin = low_MutexSpin;
642 
643 	uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
644 	PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
645 	nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
646 }
647 STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
648 
649 
650 /*
651  * This is called when all of the ml_processor_info_t structures have been
652  * initialized and all the processors have been started through processor_start().
653  *
654  * Required by the scheduler subsystem.
655  */
656 void
ml_cpu_init_completed(void)657 ml_cpu_init_completed(void)
658 {
659 	if (SCHED(cpu_init_completed) != NULL) {
660 		SCHED(cpu_init_completed)();
661 	}
662 }
663 
/*
 * ml_cpu_up() and ml_cpu_up_update_counts() are called from the
 * machine-independent routine cpu_up() to perform machine-dependent info
 * updates.
 *
 * The CPU-count update is kept separate from other actions because the
 * counts are not updated when CLPC causes temporary cluster powerdown
 * events, as these must be transparent to the user.
 */
void
ml_cpu_up(void)
{
	/* Nothing to do on this platform. */
}
676 
677 void
ml_cpu_up_update_counts(int cpu_id)678 ml_cpu_up_update_counts(int cpu_id)
679 {
680 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
681 
682 	os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
683 
684 	os_atomic_inc(&machine_info.physical_cpu, relaxed);
685 	os_atomic_inc(&machine_info.logical_cpu, relaxed);
686 }
687 
688 /*
689  * These are called from the machine-independent routine cpu_down()
690  * to perform machine-dependent info updates.
691  *
692  * The update to CPU counts needs to be separate from other actions
693  * because we don't update the counts when CLPC causes temporary
694  * cluster powerdown events, as these must be transparent to the user.
695  */
696 void
ml_cpu_down(void)697 ml_cpu_down(void)
698 {
699 	/*
700 	 * If we want to deal with outstanding IPIs, we need to
701 	 * do relatively early in the processor_doshutdown path,
702 	 * as we pend decrementer interrupts using the IPI
703 	 * mechanism if we cannot immediately service them (if
704 	 * IRQ is masked).  Do so now.
705 	 *
706 	 * We aren't on the interrupt stack here; would it make
707 	 * more sense to disable signaling and then enable
708 	 * interrupts?  It might be a bit cleaner.
709 	 */
710 	cpu_data_t *cpu_data_ptr = getCpuDatap();
711 	cpu_data_ptr->cpu_running = FALSE;
712 
713 	if (cpu_data_ptr != &BootCpuData) {
714 		/*
715 		 * Move all of this cpu's timers to the master/boot cpu,
716 		 * and poke it in case there's a sooner deadline for it to schedule.
717 		 */
718 		timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
719 		kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
720 		if (rv != KERN_SUCCESS) {
721 			panic("ml_cpu_down: IPI failure %d", rv);
722 		}
723 	}
724 
725 	cpu_signal_handler_internal(TRUE);
726 }
727 void
ml_cpu_down_update_counts(int cpu_id)728 ml_cpu_down_update_counts(int cpu_id)
729 {
730 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
731 
732 	os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
733 
734 	os_atomic_dec(&machine_info.physical_cpu, relaxed);
735 	os_atomic_dec(&machine_info.logical_cpu, relaxed);
736 }
737 
738 
739 unsigned int
ml_get_machine_mem(void)740 ml_get_machine_mem(void)
741 {
742 	return machine_info.memory_size;
743 }
744 
745 __attribute__((noreturn))
746 void
halt_all_cpus(boolean_t reboot)747 halt_all_cpus(boolean_t reboot)
748 {
749 	if (reboot) {
750 		printf("MACH Reboot\n");
751 		PEHaltRestart(kPERestartCPU);
752 	} else {
753 		printf("CPU halted\n");
754 		PEHaltRestart(kPEHaltCPU);
755 	}
756 	while (1) {
757 		;
758 	}
759 }
760 
761 __attribute__((noreturn))
762 void
halt_cpu(void)763 halt_cpu(void)
764 {
765 	halt_all_cpus(FALSE);
766 }
767 
768 /*
769  *	Routine:        machine_signal_idle
770  *	Function:
771  */
772 void
machine_signal_idle(processor_t processor)773 machine_signal_idle(
774 	processor_t processor)
775 {
776 	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
777 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
778 }
779 
780 void
machine_signal_idle_deferred(processor_t processor)781 machine_signal_idle_deferred(
782 	processor_t processor)
783 {
784 	cpu_signal_deferred(processor_to_cpu_datap(processor));
785 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
786 }
787 
788 void
machine_signal_idle_cancel(processor_t processor)789 machine_signal_idle_cancel(
790 	processor_t processor)
791 {
792 	cpu_signal_cancel(processor_to_cpu_datap(processor));
793 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
794 }
795 
796 /*
797  *	Routine:        ml_install_interrupt_handler
798  *	Function:	Initialize Interrupt Handler
799  */
800 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)801 ml_install_interrupt_handler(
802 	void *nub,
803 	int source,
804 	void *target,
805 	IOInterruptHandler handler,
806 	void *refCon)
807 {
808 	cpu_data_t     *cpu_data_ptr;
809 	boolean_t       current_state;
810 
811 	current_state = ml_set_interrupts_enabled(FALSE);
812 	cpu_data_ptr = getCpuDatap();
813 
814 	cpu_data_ptr->interrupt_nub = nub;
815 	cpu_data_ptr->interrupt_source = source;
816 	cpu_data_ptr->interrupt_target = target;
817 	cpu_data_ptr->interrupt_handler = handler;
818 	cpu_data_ptr->interrupt_refCon = refCon;
819 
820 	(void) ml_set_interrupts_enabled(current_state);
821 }
822 
/*
 *	Routine:        ml_init_interrupt
 *	Function:	Initialize Interrupts.  With HAS_IPI this programs the
 *			deferred IPI timer; otherwise it does nothing.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * ml_init_interrupt is called once per CPU, but on skye there is only
	 * one global copy of the register, so programming it from every CPU
	 * would be redundant.  Only CPUs whose cpu data has cluster_master
	 * set perform the write.
	 */
	cpu_data_t *cpu = getCpuDatap();

	if (cpu->cluster_master) {
		ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
	}
#endif
}
841 
842 /*
843  *	Routine:        ml_init_timebase
844  *	Function:	register and setup Timebase, Decremeter services
845  */
846 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)847 ml_init_timebase(
848 	void            *args,
849 	tbd_ops_t       tbd_funcs,
850 	vm_offset_t     int_address,
851 	vm_offset_t     int_value __unused)
852 {
853 	cpu_data_t     *cpu_data_ptr;
854 
855 	cpu_data_ptr = (cpu_data_t *)args;
856 
857 	if ((cpu_data_ptr == &BootCpuData)
858 	    && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
859 		rtclock_timebase_func = *tbd_funcs;
860 		rtclock_timebase_addr = int_address;
861 	}
862 }
863 
864 #define ML_READPROP_MANDATORY UINT64_MAX
865 
866 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)867 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
868 {
869 	void const *prop;
870 	unsigned int propSize;
871 
872 	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
873 		if (propSize == sizeof(uint8_t)) {
874 			return *((uint8_t const *)prop);
875 		} else if (propSize == sizeof(uint16_t)) {
876 			return *((uint16_t const *)prop);
877 		} else if (propSize == sizeof(uint32_t)) {
878 			return *((uint32_t const *)prop);
879 		} else if (propSize == sizeof(uint64_t)) {
880 			return *((uint64_t const *)prop);
881 		} else {
882 			panic("CPU property '%s' has bad size %u", propertyName, propSize);
883 		}
884 	} else {
885 		if (default_value == ML_READPROP_MANDATORY) {
886 			panic("Missing mandatory property '%s'", propertyName);
887 		}
888 		return default_value;
889 	}
890 }
891 
892 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)893 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
894 {
895 	uint64_t const *prop;
896 	unsigned int propSize;
897 
898 	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
899 		return FALSE;
900 	}
901 
902 	if (propSize != sizeof(uint64_t) * 2) {
903 		panic("Wrong property size for %s", propertyName);
904 	}
905 
906 	*pa_ptr = prop[0];
907 	*len_ptr = prop[1];
908 	return TRUE;
909 }
910 
911 static boolean_t
ml_is_boot_cpu(const DTEntry entry)912 ml_is_boot_cpu(const DTEntry entry)
913 {
914 	void const *prop;
915 	unsigned int propSize;
916 
917 	if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
918 		panic("unable to retrieve state for cpu");
919 	}
920 
921 	if (strncmp((char const *)prop, "running", propSize) == 0) {
922 		return TRUE;
923 	} else {
924 		return FALSE;
925 	}
926 }
927 
928 static void
ml_read_chip_revision(unsigned int * rev __unused)929 ml_read_chip_revision(unsigned int *rev __unused)
930 {
931 	// The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
932 #ifdef APPLE_ARM64_ARCH_FAMILY
933 	DTEntry         entryP;
934 
935 	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
936 		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
937 	} else {
938 		*rev = CPU_VERSION_UNKNOWN;
939 	}
940 #endif
941 }
942 
943 void
ml_parse_cpu_topology(void)944 ml_parse_cpu_topology(void)
945 {
946 	DTEntry entry, child __unused;
947 	OpaqueDTEntryIterator iter;
948 	uint32_t cpu_boot_arg = MAX_CPUS;
949 	uint64_t cpumask_boot_arg = ULLONG_MAX;
950 	int err;
951 
952 	int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
953 	int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
954 	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
955 	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
956 
957 	// The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
958 	// so that we trigger a panic later in the boot process, once serial is enabled.
959 	if (cpus_boot_arg_present && cpumask_boot_arg_present) {
960 		cpu_config_correct = false;
961 	}
962 
963 	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
964 	assert(err == kSuccess);
965 
966 	err = SecureDTInitEntryIterator(entry, &iter);
967 	assert(err == kSuccess);
968 
969 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
970 		cluster_offsets[i] = -1;
971 		cluster_phys_to_logical[i] = -1;
972 		cluster_max_cpu_phys_id[i] = 0;
973 	}
974 
975 	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
976 		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
977 		boolean_t cpu_enabled = cpumask_boot_arg & 1;
978 		cpumask_boot_arg >>= 1;
979 
980 		// Boot CPU disabled in cpumask. Flag this so that we trigger a panic
981 		// later in the boot process, once serial is enabled.
982 		if (is_boot_cpu && !cpu_enabled) {
983 			cpu_config_correct = false;
984 		}
985 
986 		// Ignore this CPU if it has been disabled by the cpumask= boot-arg.
987 		if (!is_boot_cpu && !cpu_enabled) {
988 			continue;
989 		}
990 
991 		// If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
992 		// been added to the topology struct yet, and we only have one slot left, then skip
993 		// every other non-boot CPU in order to leave room for the boot CPU.
994 		//
995 		// e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
996 		// array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
997 		if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
998 			continue;
999 		}
1000 		if (topology_info.num_cpus >= cpu_boot_arg) {
1001 			break;
1002 		}
1003 
1004 		ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
1005 
1006 		cpu->cpu_id = topology_info.num_cpus++;
1007 		assert(cpu->cpu_id < MAX_CPUS);
1008 		topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
1009 
1010 		cpu->die_id = 0;
1011 		topology_info.max_die_id = 0;
1012 
1013 		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
1014 
1015 		cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
1016 		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
1017 		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
1018 		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
1019 		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
1020 
1021 		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
1022 		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
1023 		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1024 		cpu->cluster_type = CLUSTER_TYPE_SMP;
1025 
1026 		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1027 		if (cluster_type == 'E') {
1028 			cpu->cluster_type = CLUSTER_TYPE_E;
1029 		} else if (cluster_type == 'P') {
1030 			cpu->cluster_type = CLUSTER_TYPE_P;
1031 		}
1032 
1033 		topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1034 
1035 		/*
1036 		 * Since we want to keep a linear cluster ID space, we cannot just rely
1037 		 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1038 		 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1039 		 */
1040 #if HAS_CLUSTER
1041 		uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1042 #else
1043 		uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1044 #endif
1045 		assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1046 		cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1047 		    topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1048 
1049 		assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1050 
1051 		ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1052 		if (cluster->num_cpus == 0) {
1053 			assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1054 
1055 			topology_info.num_clusters++;
1056 			topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1057 			topology_info.cluster_types |= (1 << cpu->cluster_type);
1058 
1059 			cluster->cluster_id = cpu->cluster_id;
1060 			cluster->cluster_type = cpu->cluster_type;
1061 			cluster->first_cpu_id = cpu->cpu_id;
1062 			assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1063 			cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1064 
1065 			topology_info.cluster_type_num_clusters[cluster->cluster_type]++;
1066 
1067 			// Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1068 			// If we wind up with a bunch of these, we might want to create separate per-cluster
1069 			// EDT nodes and have the CPU nodes reference them through a phandle.
1070 			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1071 			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1072 		}
1073 
1074 #if HAS_CLUSTER
1075 		if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1076 			cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1077 		}
1078 #endif
1079 
1080 		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1081 		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1082 
1083 		cluster->num_cpus++;
1084 		cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1085 
1086 		if (is_boot_cpu) {
1087 			assert(topology_info.boot_cpu == NULL);
1088 			topology_info.boot_cpu = cpu;
1089 			topology_info.boot_cluster = cluster;
1090 		}
1091 
1092 	}
1093 
1094 #if HAS_CLUSTER
1095 	/*
1096 	 * Build the cluster offset array, ensuring that the region reserved
1097 	 * for each physical cluster contains enough entries to be indexed
1098 	 * by the maximum physical CPU ID (AFF0) within the cluster.
1099 	 */
1100 	unsigned int cur_cluster_offset = 0;
1101 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1102 		if (cluster_phys_to_logical[i] != -1) {
1103 			cluster_offsets[i] = cur_cluster_offset;
1104 			cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1105 		}
1106 	}
1107 	assert(cur_cluster_offset <= MAX_CPUS);
1108 #else
1109 	/*
1110 	 * For H10, there are really 2 physical clusters, but they are not separated
1111 	 * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
1112 	 * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
1113 	 * treat H10 and earlier devices as though they contain a single cluster.
1114 	 */
1115 	cluster_offsets[0] = 0;
1116 #endif
1117 	assert(topology_info.boot_cpu != NULL);
1118 	ml_read_chip_revision(&topology_info.chip_revision);
1119 
1120 	/*
1121 	 * Set TPIDR_EL0 to indicate the correct cpu number, as we may
1122 	 * not be booting from cpu 0.  Userspace will consume the current
1123 	 * CPU number through this register.  For non-boot cores, this is
1124 	 * done in start.s (start_cpu) using the cpu_number field of the
1125 	 * per-cpu data object.
1126 	 */
1127 	uint64_t cpuid = topology_info.boot_cpu->cpu_id;
1128 
1129 	__builtin_arm_wsr64("TPIDR_EL0", cpuid & MACHDEP_TPIDR_CPUNUM_MASK);
1130 	assert((cpuid & MACHDEP_TPIDR_CPUNUM_MASK) == cpuid);
1131 	__builtin_arm_wsr64("TPIDRRO_EL0", 0);
1132 }
1133 
1134 const ml_topology_info_t *
ml_get_topology_info(void)1135 ml_get_topology_info(void)
1136 {
1137 	return &topology_info;
1138 }
1139 
1140 void
ml_map_cpu_pio(void)1141 ml_map_cpu_pio(void)
1142 {
1143 	unsigned int i;
1144 
1145 	for (i = 0; i < topology_info.num_cpus; i++) {
1146 		ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1147 		if (cpu->cpu_IMPL_pa) {
1148 			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1149 			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1150 		}
1151 		if (cpu->cpu_UTTDBG_pa) {
1152 			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1153 		}
1154 	}
1155 
1156 	for (i = 0; i < topology_info.num_clusters; i++) {
1157 		ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1158 		if (cluster->acc_IMPL_pa) {
1159 			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1160 		}
1161 		if (cluster->cpm_IMPL_pa) {
1162 			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1163 		}
1164 	}
1165 }
1166 
1167 unsigned int
ml_get_cpu_count(void)1168 ml_get_cpu_count(void)
1169 {
1170 	return topology_info.num_cpus;
1171 }
1172 
1173 unsigned int
ml_get_cluster_count(void)1174 ml_get_cluster_count(void)
1175 {
1176 	return topology_info.num_clusters;
1177 }
1178 
1179 int
ml_get_boot_cpu_number(void)1180 ml_get_boot_cpu_number(void)
1181 {
1182 	return topology_info.boot_cpu->cpu_id;
1183 }
1184 
1185 cluster_type_t
ml_get_boot_cluster_type(void)1186 ml_get_boot_cluster_type(void)
1187 {
1188 	return topology_info.boot_cluster->cluster_type;
1189 }
1190 
1191 int
ml_get_cpu_number(uint32_t phys_id)1192 ml_get_cpu_number(uint32_t phys_id)
1193 {
1194 	phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1195 
1196 	for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1197 		if (topology_info.cpus[i].phys_id == phys_id) {
1198 			return i;
1199 		}
1200 	}
1201 
1202 	return -1;
1203 }
1204 
1205 int
ml_get_cluster_number(uint32_t phys_id)1206 ml_get_cluster_number(uint32_t phys_id)
1207 {
1208 	int cpu_id = ml_get_cpu_number(phys_id);
1209 	if (cpu_id < 0) {
1210 		return -1;
1211 	}
1212 
1213 	ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1214 
1215 	return cpu->cluster_id;
1216 }
1217 
/*
 * Return the logical number of the CPU executing this code, derived from
 * the constant bits of MPIDR_EL1.
 *
 * A failed lookup (-1 from ml_get_cpu_number()) becomes a very large
 * unsigned value, which the assertion below rejects.
 */
unsigned int
ml_get_cpu_number_local(void)
{
	uint64_t mpidr = 0;

	MRS(mpidr, "MPIDR_EL1");

	const unsigned logical_cpu = ml_get_cpu_number((uint32_t)mpidr);

	assert(logical_cpu <= (unsigned int)ml_get_max_cpu_number());

	return logical_cpu;
}
1232 
/*
 * Return the logical ID of the cluster containing the CPU executing this
 * code, derived from the constant bits of MPIDR_EL1.
 *
 * ml_get_cluster_number() returns -1 for an unknown CPU; stored in the
 * unsigned local that becomes a very large value, which the assertion
 * rejects.
 */
int
ml_get_cluster_number_local(void)
{
	uint64_t mpidr_el1_value = 0;
	unsigned cluster_id;

	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
	MRS(mpidr_el1_value, "MPIDR_EL1");
	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);

	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());

	return cluster_id;
}
1247 
1248 int
ml_get_max_cpu_number(void)1249 ml_get_max_cpu_number(void)
1250 {
1251 	return topology_info.max_cpu_id;
1252 }
1253 
1254 int
ml_get_max_cluster_number(void)1255 ml_get_max_cluster_number(void)
1256 {
1257 	return topology_info.max_cluster_id;
1258 }
1259 
1260 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1261 ml_get_first_cpu_id(unsigned int cluster_id)
1262 {
1263 	return topology_info.clusters[cluster_id].first_cpu_id;
1264 }
1265 
1266 /*
1267  * Return the die id of a cluster.
1268  */
1269 unsigned int
ml_get_die_id(unsigned int cluster_id)1270 ml_get_die_id(unsigned int cluster_id)
1271 {
1272 	/*
1273 	 * The current implementation gets the die_id from the
1274 	 * first CPU of the cluster.
1275 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1276 	 */
1277 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1278 	return topology_info.cpus[first_cpu].die_id;
1279 }
1280 
1281 /*
1282  * Return the index of a cluster in its die.
1283  */
1284 unsigned int
ml_get_die_cluster_id(unsigned int cluster_id)1285 ml_get_die_cluster_id(unsigned int cluster_id)
1286 {
1287 	/*
1288 	 * The current implementation gets the die_id from the
1289 	 * first CPU of the cluster.
1290 	 * rdar://80917654 (Add the die_id field to the cluster topology info)
1291 	 */
1292 	unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1293 	return topology_info.cpus[first_cpu].die_cluster_id;
1294 }
1295 
1296 /*
1297  * Return the highest die id of the system.
1298  */
1299 unsigned int
ml_get_max_die_id(void)1300 ml_get_max_die_id(void)
1301 {
1302 	return topology_info.max_die_id;
1303 }
1304 
/*
 * Lockdown initialization hook.  On KTRR/CTRR kernel-integrity builds it
 * calls rorgn_stash_range(); on every other build it does nothing.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif
}
1312 
1313 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1314 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1315 {
1316 	if (!f) {
1317 		return KERN_FAILURE;
1318 	}
1319 
1320 	assert(lockdown_done);
1321 	f(this); // XXX: f this whole function
1322 
1323 	return KERN_SUCCESS;
1324 }
1325 
1326 static mcache_flush_function mcache_flush_func;
1327 static void* mcache_flush_service;
1328 kern_return_t
ml_mcache_flush_callback_register(mcache_flush_function func,void * service)1329 ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
1330 {
1331 	mcache_flush_service = service;
1332 	mcache_flush_func = func;
1333 
1334 	return KERN_SUCCESS;
1335 }
1336 
1337 kern_return_t
ml_mcache_flush(void)1338 ml_mcache_flush(void)
1339 {
1340 	if (!mcache_flush_func) {
1341 		panic("Cannot flush M$ with no flush callback registered");
1342 
1343 		return KERN_FAILURE;
1344 	} else {
1345 		return mcache_flush_func(mcache_flush_service);
1346 	}
1347 }
1348 
1349 
1350 extern lck_mtx_t pset_create_lock;
1351 
1352 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1353 ml_processor_register(ml_processor_info_t *in_processor_info,
1354     processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1355     perfmon_interrupt_handler_func *pmi_handler_out)
1356 {
1357 	cpu_data_t *this_cpu_datap;
1358 	processor_set_t pset;
1359 	boolean_t  is_boot_cpu;
1360 	static unsigned int reg_cpu_count = 0;
1361 
1362 	if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1363 		return KERN_FAILURE;
1364 	}
1365 
1366 	if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
1367 		return KERN_FAILURE;
1368 	}
1369 
1370 	if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1371 		is_boot_cpu = FALSE;
1372 		this_cpu_datap = cpu_data_alloc(FALSE);
1373 		cpu_data_init(this_cpu_datap);
1374 	} else {
1375 		this_cpu_datap = &BootCpuData;
1376 		is_boot_cpu = TRUE;
1377 	}
1378 
1379 	assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1380 
1381 	this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1382 
1383 	if (!is_boot_cpu) {
1384 		this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1385 
1386 		if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1387 			goto processor_register_error;
1388 		}
1389 		assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1390 	}
1391 
1392 	this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1393 	this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1394 	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1395 	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1396 
1397 	this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1398 	this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1399 
1400 	this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1401 	this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1402 	this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1403 	this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1404 
1405 	this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1406 	this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1407 	this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1408 	this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1409 	this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1410 	this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1411 
1412 #if HAS_CLUSTER
1413 	this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1414 #else /* HAS_CLUSTER */
1415 	this_cpu_datap->cluster_master = is_boot_cpu;
1416 #endif /* HAS_CLUSTER */
1417 	lck_mtx_lock(&pset_create_lock);
1418 	pset = pset_find(in_processor_info->cluster_id, NULL);
1419 	kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1420 	if (pset == NULL) {
1421 #if __AMP__
1422 		pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1423 		pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1424 		assert(pset != PROCESSOR_SET_NULL);
1425 		kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1426 #else /* __AMP__ */
1427 		pset_cluster_type_t pset_cluster_type = PSET_SMP;
1428 		pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1429 		assert(pset != PROCESSOR_SET_NULL);
1430 #endif /* __AMP__ */
1431 	}
1432 	kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1433 	lck_mtx_unlock(&pset_create_lock);
1434 
1435 	processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1436 	if (!is_boot_cpu) {
1437 		processor_init(processor, this_cpu_datap->cpu_number, pset);
1438 
1439 		if (this_cpu_datap->cpu_l2_access_penalty) {
1440 			/*
1441 			 * Cores that have a non-zero L2 access penalty compared
1442 			 * to the boot processor should be de-prioritized by the
1443 			 * scheduler, so that threads use the cores with better L2
1444 			 * preferentially.
1445 			 */
1446 			processor_set_primary(processor, master_processor);
1447 		}
1448 	}
1449 
1450 	*processor_out = processor;
1451 	*ipi_handler_out = cpu_signal_handler;
1452 #if CPMU_AIC_PMI && MONOTONIC
1453 	*pmi_handler_out = mt_cpmu_aic_pmi;
1454 #else
1455 	*pmi_handler_out = NULL;
1456 #endif /* CPMU_AIC_PMI && MONOTONIC */
1457 	if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1458 		*in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1459 	}
1460 
1461 #if KPC
1462 	if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1463 		goto processor_register_error;
1464 	}
1465 #endif /* KPC */
1466 
1467 	if (!is_boot_cpu) {
1468 		random_cpu_init(this_cpu_datap->cpu_number);
1469 		// now let next CPU register itself
1470 		OSIncrementAtomic((SInt32*)&real_ncpus);
1471 	}
1472 
1473 	return KERN_SUCCESS;
1474 
1475 processor_register_error:
1476 #if KPC
1477 	kpc_unregister_cpu(this_cpu_datap);
1478 #endif /* KPC */
1479 	if (!is_boot_cpu) {
1480 		cpu_data_free(this_cpu_datap);
1481 	}
1482 
1483 	return KERN_FAILURE;
1484 }
1485 
1486 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1487 ml_init_arm_debug_interface(
1488 	void * in_cpu_datap,
1489 	vm_offset_t virt_address)
1490 {
1491 	((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1492 	do_debugid();
1493 }
1494 
1495 /*
1496  *	Routine:        init_ast_check
1497  *	Function:
1498  */
1499 void
init_ast_check(__unused processor_t processor)1500 init_ast_check(
1501 	__unused processor_t processor)
1502 {
1503 }
1504 
1505 /*
1506  *	Routine:        cause_ast_check
1507  *	Function:
1508  */
1509 void
cause_ast_check(processor_t processor)1510 cause_ast_check(
1511 	processor_t processor)
1512 {
1513 	if (current_processor() != processor) {
1514 		cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1515 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1516 	}
1517 }
1518 
1519 extern uint32_t cpu_idle_count;
1520 
1521 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1522 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1523 {
1524 	*icp = ml_at_interrupt_context();
1525 	*pidlep = (cpu_idle_count == real_ncpus);
1526 }
1527 
/*
 *	Routine:        ml_cause_interrupt
 *	Function:	Generate a fake interrupt.  This implementation is
 *			an empty stub (BS_XXX).
 */
void
ml_cause_interrupt(void)
{
	/* BS_XXX: intentionally does nothing. */
}
1537 
1538 /* Map memory map IO space */
1539 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1540 ml_io_map(
1541 	vm_offset_t phys_addr,
1542 	vm_size_t size)
1543 {
1544 	return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1545 }
1546 
1547 /* Map memory map IO space (with protections specified) */
1548 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1549 ml_io_map_with_prot(
1550 	vm_offset_t phys_addr,
1551 	vm_size_t size,
1552 	vm_prot_t prot)
1553 {
1554 	return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1555 }
1556 
1557 vm_offset_t
ml_io_map_unmappable(vm_offset_t phys_addr,vm_size_t size,unsigned int flags)1558 ml_io_map_unmappable(
1559 	vm_offset_t             phys_addr,
1560 	vm_size_t               size,
1561 	unsigned int            flags)
1562 {
1563 	return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1564 }
1565 
1566 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1567 ml_io_map_wcomb(
1568 	vm_offset_t phys_addr,
1569 	vm_size_t size)
1570 {
1571 	return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1572 }
1573 
1574 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1575 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1576 {
1577 	pmap_remove(kernel_pmap, addr, addr + sz);
1578 	kmem_free(kernel_map, addr, sz);
1579 }
1580 
1581 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1582 ml_map_high_window(
1583 	vm_offset_t     phys_addr,
1584 	vm_size_t       len)
1585 {
1586 	return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1587 }
1588 
1589 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1590 ml_static_ptovirt(
1591 	vm_offset_t paddr)
1592 {
1593 	return phystokv(paddr);
1594 }
1595 
1596 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1597 ml_static_slide(
1598 	vm_offset_t vaddr)
1599 {
1600 	vm_offset_t slid_vaddr = 0;
1601 
1602 	{
1603 		slid_vaddr = vaddr + vm_kernel_slide;
1604 	}
1605 
1606 	if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
1607 		/* This is only intended for use on static kernel addresses. */
1608 		return 0;
1609 	}
1610 
1611 	return slid_vaddr;
1612 }
1613 
1614 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1615 ml_static_unslide(
1616 	vm_offset_t vaddr)
1617 {
1618 	if (!VM_KERNEL_IS_SLID(vaddr)) {
1619 		/* This is only intended for use on static kernel addresses. */
1620 		return 0;
1621 	}
1622 
1623 
1624 	return vaddr - vm_kernel_slide;
1625 }
1626 
1627 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1628 
1629 kern_return_t
ml_static_protect(vm_offset_t vaddr,vm_size_t size,vm_prot_t new_prot __unused)1630 ml_static_protect(
1631 	vm_offset_t vaddr, /* kernel virtual address */
1632 	vm_size_t size,
1633 	vm_prot_t new_prot __unused)
1634 {
1635 	pt_entry_t    arm_prot = 0;
1636 	pt_entry_t    arm_block_prot = 0;
1637 	vm_offset_t   vaddr_cur;
1638 	ppnum_t       ppn;
1639 	kern_return_t result = KERN_SUCCESS;
1640 
1641 	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1642 		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) VM_MIN_KERNEL_ADDRESS);
1643 		return KERN_FAILURE;
1644 	}
1645 
1646 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1647 
1648 	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
1649 		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
1650 	}
1651 	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
1652 		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
1653 	}
1654 
1655 	/* Set up the protection bits, and block bits so we can validate block mappings. */
1656 	if (new_prot & VM_PROT_WRITE) {
1657 		arm_prot |= ARM_PTE_AP(AP_RWNA);
1658 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
1659 	} else {
1660 		arm_prot |= ARM_PTE_AP(AP_RONA);
1661 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
1662 	}
1663 
1664 	arm_prot |= ARM_PTE_NX;
1665 	arm_block_prot |= ARM_TTE_BLOCK_NX;
1666 
1667 	if (!(new_prot & VM_PROT_EXECUTE)) {
1668 		arm_prot |= ARM_PTE_PNX;
1669 		arm_block_prot |= ARM_TTE_BLOCK_PNX;
1670 	}
1671 
1672 	for (vaddr_cur = vaddr;
1673 	    vaddr_cur < trunc_page_64(vaddr + size);
1674 	    vaddr_cur += PAGE_SIZE) {
1675 		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1676 		if (ppn != (vm_offset_t) NULL) {
1677 			tt_entry_t      *tte2;
1678 			pt_entry_t      *pte_p;
1679 			pt_entry_t      ptmp;
1680 
1681 #if XNU_MONITOR
1682 			assert(!pmap_is_monitor(ppn));
1683 			assert(!TEST_PAGE_RATIO_4);
1684 #endif
1685 
1686 			tte2 = arm_kva_to_tte(vaddr_cur);
1687 
1688 			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1689 				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
1690 				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
1691 					/*
1692 					 * We can support ml_static_protect on a block mapping if the mapping already has
1693 					 * the desired protections.  We still want to run checks on a per-page basis.
1694 					 */
1695 					continue;
1696 				}
1697 
1698 				result = KERN_FAILURE;
1699 				break;
1700 			}
1701 
1702 			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
1703 			ptmp = *pte_p;
1704 
1705 			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
1706 				/*
1707 				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
1708 				 * protections do not match the desired protections, then we will fail (as we cannot update
1709 				 * this mapping without updating other mappings as well).
1710 				 */
1711 				result = KERN_FAILURE;
1712 				break;
1713 			}
1714 
1715 			__unreachable_ok_push
1716 			if (TEST_PAGE_RATIO_4) {
1717 				{
1718 					unsigned int    i;
1719 					pt_entry_t      *ptep_iter;
1720 
1721 					ptep_iter = pte_p;
1722 					for (i = 0; i < 4; i++, ptep_iter++) {
1723 						/* Note that there is a hole in the HINT sanity checking here. */
1724 						ptmp = *ptep_iter;
1725 
1726 						/* We only need to update the page tables if the protections do not match. */
1727 						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
1728 							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
1729 							*ptep_iter = ptmp;
1730 						}
1731 					}
1732 				}
1733 			} else {
1734 				ptmp = *pte_p;
1735 				/* We only need to update the page tables if the protections do not match. */
1736 				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
1737 					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
1738 					*pte_p = ptmp;
1739 				}
1740 			}
1741 			__unreachable_ok_pop
1742 		}
1743 	}
1744 
1745 	if (vaddr_cur > vaddr) {
1746 		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
1747 		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
1748 	}
1749 
1750 
1751 	return result;
1752 }
1753 
1754 
1755 /*
1756  *	Routine:        ml_static_mfree
1757  *	Function:
1758  */
1759 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)1760 ml_static_mfree(
1761 	vm_offset_t vaddr,
1762 	vm_size_t   size)
1763 {
1764 	vm_offset_t vaddr_cur;
1765 	vm_offset_t paddr_cur;
1766 	ppnum_t     ppn;
1767 	uint32_t    freed_pages = 0;
1768 	uint32_t    freed_kernelcache_pages = 0;
1769 
1770 
1771 	/* It is acceptable (if bad) to fail to free. */
1772 	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1773 		return;
1774 	}
1775 
1776 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1777 
1778 	for (vaddr_cur = vaddr;
1779 	    vaddr_cur < trunc_page_64(vaddr + size);
1780 	    vaddr_cur += PAGE_SIZE) {
1781 		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1782 		if (ppn != (vm_offset_t) NULL) {
1783 			/*
1784 			 * It is not acceptable to fail to update the protections on a page
1785 			 * we will release to the VM.  We need to either panic or continue.
1786 			 * For now, we'll panic (to help flag if there is memory we can
1787 			 * reclaim).
1788 			 */
1789 			if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
1790 				panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1791 			}
1792 
1793 			paddr_cur = ptoa(ppn);
1794 
1795 
1796 			vm_page_create(ppn, (ppn + 1));
1797 			freed_pages++;
1798 			if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end) {
1799 				freed_kernelcache_pages++;
1800 			}
1801 		}
1802 	}
1803 	vm_page_lockspin_queues();
1804 	vm_page_wire_count -= freed_pages;
1805 	vm_page_wire_count_initial -= freed_pages;
1806 	vm_page_kernelcache_count -= freed_kernelcache_pages;
1807 	vm_page_unlock_queues();
1808 #if     DEBUG
1809 	kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
1810 #endif
1811 }
1812 
1813 /*
1814  * Routine: ml_page_protection_type
1815  * Function: Returns the type of page protection that the system supports.
1816  */
1817 ml_page_protection_t
ml_page_protection_type(void)1818 ml_page_protection_type(void)
1819 {
1820 #if   XNU_MONITOR
1821 	return 1;
1822 #else
1823 	return 0;
1824 #endif
1825 }
1826 
1827 /* virtual to physical on wired pages */
1828 vm_offset_t
ml_vtophys(vm_offset_t vaddr)1829 ml_vtophys(vm_offset_t vaddr)
1830 {
1831 	return kvtophys(vaddr);
1832 }
1833 
1834 /*
1835  * Routine: ml_nofault_copy
1836  * Function: Perform a physical mode copy if the source and destination have
1837  * valid translations in the kernel pmap. If translations are present, they are
1838  * assumed to be wired; e.g., no attempt is made to guarantee that the
1839  * translations obtained remain valid for the duration of the copy process.
1840  */
1841 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)1842 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
1843 {
1844 	addr64_t        cur_phys_dst, cur_phys_src;
1845 	vm_size_t       count, nbytes = 0;
1846 
1847 	while (size > 0) {
1848 		if (!(cur_phys_src = kvtophys(virtsrc))) {
1849 			break;
1850 		}
1851 		if (!(cur_phys_dst = kvtophys(virtdst))) {
1852 			break;
1853 		}
1854 		if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
1855 		    !pmap_valid_address(trunc_page_64(cur_phys_src))) {
1856 			break;
1857 		}
1858 		count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
1859 		if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
1860 			count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
1861 		}
1862 		if (count > size) {
1863 			count = size;
1864 		}
1865 
1866 		bcopy_phys(cur_phys_src, cur_phys_dst, count);
1867 
1868 		nbytes += count;
1869 		virtsrc += count;
1870 		virtdst += count;
1871 		size -= count;
1872 	}
1873 
1874 	return nbytes;
1875 }
1876 
1877 /*
1878  *	Routine:        ml_validate_nofault
1879  *	Function: Validate that ths address range has a valid translations
1880  *			in the kernel pmap.  If translations are present, they are
1881  *			assumed to be wired; i.e. no attempt is made to guarantee
1882  *			that the translation persist after the check.
1883  *  Returns: TRUE if the range is mapped and will not cause a fault,
1884  *			FALSE otherwise.
1885  */
1886 
1887 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)1888 ml_validate_nofault(
1889 	vm_offset_t virtsrc, vm_size_t size)
1890 {
1891 	addr64_t cur_phys_src;
1892 	uint32_t count;
1893 
1894 	while (size > 0) {
1895 		if (!(cur_phys_src = kvtophys(virtsrc))) {
1896 			return FALSE;
1897 		}
1898 		if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
1899 			return FALSE;
1900 		}
1901 		count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
1902 		if (count > size) {
1903 			count = (uint32_t)size;
1904 		}
1905 
1906 		virtsrc += count;
1907 		size -= count;
1908 	}
1909 
1910 	return TRUE;
1911 }
1912 
1913 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)1914 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
1915 {
1916 	*phys_addr = 0;
1917 	*size = 0;
1918 }
1919 
1920 void
active_rt_threads(__unused boolean_t active)1921 active_rt_threads(__unused boolean_t active)
1922 {
1923 }
1924 
1925 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)1926 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
1927 {
1928 	return;
1929 }
1930 
1931 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
1932 
1933 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)1934 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
1935 {
1936 	if (cpu_qos_cb != NULL) {
1937 		cpu_qos_update = cpu_qos_cb;
1938 	} else {
1939 		cpu_qos_update = cpu_qos_cb_default;
1940 	}
1941 }
1942 
1943 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)1944 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
1945 {
1946 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
1947 
1948 	cpu_qos_update((int)urgency, rt_period, rt_deadline);
1949 
1950 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
1951 }
1952 
1953 void
machine_run_count(__unused uint32_t count)1954 machine_run_count(__unused uint32_t count)
1955 {
1956 }
1957 
1958 processor_t
machine_choose_processor(__unused processor_set_t pset,processor_t processor)1959 machine_choose_processor(__unused processor_set_t pset, processor_t processor)
1960 {
1961 	return processor;
1962 }
1963 
1964 #if KASAN
1965 vm_offset_t ml_stack_base(void);
1966 vm_size_t ml_stack_size(void);
1967 
1968 vm_offset_t
ml_stack_base(void)1969 ml_stack_base(void)
1970 {
1971 	uintptr_t local = (uintptr_t) &local;
1972 	vm_offset_t     intstack_top_ptr;
1973 
1974 	intstack_top_ptr = getCpuDatap()->intstack_top;
1975 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1976 		return intstack_top_ptr - INTSTACK_SIZE;
1977 	} else {
1978 		return current_thread()->kernel_stack;
1979 	}
1980 }
1981 vm_size_t
ml_stack_size(void)1982 ml_stack_size(void)
1983 {
1984 	uintptr_t local = (uintptr_t) &local;
1985 	vm_offset_t     intstack_top_ptr;
1986 
1987 	intstack_top_ptr = getCpuDatap()->intstack_top;
1988 	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
1989 		return INTSTACK_SIZE;
1990 	} else {
1991 		return kernel_stack_size;
1992 	}
1993 }
1994 #endif
1995 
1996 #ifdef CONFIG_KCOV
1997 
1998 kcov_cpu_data_t *
current_kcov_data(void)1999 current_kcov_data(void)
2000 {
2001 	return &current_cpu_datap()->cpu_kcov_data;
2002 }
2003 
2004 kcov_cpu_data_t *
cpu_kcov_data(int cpuid)2005 cpu_kcov_data(int cpuid)
2006 {
2007 	return &cpu_datap(cpuid)->cpu_kcov_data;
2008 }
2009 
2010 #endif /* CONFIG_KCOV */
2011 
2012 boolean_t
machine_timeout_suspended(void)2013 machine_timeout_suspended(void)
2014 {
2015 	return FALSE;
2016 }
2017 
2018 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)2019 ml_interrupt_prewarm(__unused uint64_t deadline)
2020 {
2021 	return KERN_FAILURE;
2022 }
2023 
2024 /*
2025  * Assumes fiq, irq disabled.
2026  */
2027 void
ml_set_decrementer(uint32_t dec_value)2028 ml_set_decrementer(uint32_t dec_value)
2029 {
2030 	cpu_data_t      *cdp = getCpuDatap();
2031 
2032 	assert(ml_get_interrupts_enabled() == FALSE);
2033 	cdp->cpu_decrementer = dec_value;
2034 
2035 	if (cdp->cpu_set_decrementer_func) {
2036 		cdp->cpu_set_decrementer_func(dec_value);
2037 	} else {
2038 		__builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
2039 	}
2040 }
2041 
2042 /**
2043  * Reads from a non-speculative view of the timebase.  If no such view exists on
2044  * this CPU, then an ISB is used to prevent speculation instead.
2045  *
2046  * @return the current value of the hardware timebase
2047  */
2048 static inline uint64_t
nonspeculative_timebase(void)2049 nonspeculative_timebase(void)
2050 {
2051 #if defined(HAS_ACNTVCT)
2052 	return __builtin_arm_rsr64("ACNTVCT_EL0");
2053 #elif __ARM_ARCH_8_6__
2054 	return __builtin_arm_rsr64("CNTVCTSS_EL0");
2055 #else
2056 	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
2057 	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
2058 	// to other instructions executed on the same processor."
2059 	__builtin_arm_isb(ISB_SY);
2060 	return __builtin_arm_rsr64("CNTVCT_EL0");
2061 #endif
2062 }
2063 
2064 
/**
 * Returns the raw hardware timebase, read through the non-speculative view
 * provided by nonspeculative_timebase().  No per-CPU base offset is applied.
 *
 * @return the current value of the hardware timebase
 */
uint64_t
ml_get_hwclock(void)
{
	return nonspeculative_timebase();
}
2071 
2072 uint64_t
ml_get_timebase()2073 ml_get_timebase()
2074 {
2075 	uint64_t clock, timebase;
2076 
2077 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
2078 	do {
2079 		timebase = getCpuDatap()->cpu_base_timebase;
2080 		os_compiler_barrier();
2081 		clock = ml_get_hwclock();
2082 		os_compiler_barrier();
2083 	} while (getCpuDatap()->cpu_base_timebase != timebase);
2084 
2085 	return clock + timebase;
2086 }
2087 
2088 /**
2089  * Issue a barrier that guarantees all prior memory accesses will complete
2090  * before any subsequent timebase reads.
2091  */
2092 void
ml_memory_to_timebase_fence(void)2093 ml_memory_to_timebase_fence(void)
2094 {
2095 	__builtin_arm_dmb(DMB_SY);
2096 	const uint64_t take_backwards_branch = 0;
2097 	asm volatile (
2098         "1:"
2099                 "ldr	x0, [%[take_backwards_branch]]" "\n"
2100                 "cbnz	x0, 1b"                         "\n"
2101                 :
2102                 : [take_backwards_branch] "r"(&take_backwards_branch)
2103                 : "x0"
2104         );
2105 
2106 	/* throwaway read to prevent ml_get_speculative_timebase() reordering */
2107 	(void)ml_get_hwclock();
2108 }
2109 
2110 /**
2111  * Issue a barrier that guarantees all prior timebase reads will
2112  * be ordered before any subsequent memory accesses.
2113  */
2114 void
ml_timebase_to_memory_fence(void)2115 ml_timebase_to_memory_fence(void)
2116 {
2117 	__builtin_arm_isb(ISB_SY);
2118 }
2119 
2120 /*
2121  * Get the speculative timebase without an ISB.
2122  */
2123 uint64_t
ml_get_speculative_timebase(void)2124 ml_get_speculative_timebase(void)
2125 {
2126 	uint64_t clock, timebase;
2127 
2128 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
2129 	do {
2130 		timebase = getCpuDatap()->cpu_base_timebase;
2131 		os_compiler_barrier();
2132 		clock = __builtin_arm_rsr64("CNTVCT_EL0");
2133 
2134 		os_compiler_barrier();
2135 	} while (getCpuDatap()->cpu_base_timebase != timebase);
2136 
2137 	return clock + timebase;
2138 }
2139 
/* Timebase sample for entropy purposes; simply the speculative timebase. */
uint64_t
ml_get_timebase_entropy(void)
{
	uint64_t sample = ml_get_speculative_timebase();

	return sample;
}
2145 
2146 uint32_t
ml_get_decrementer(void)2147 ml_get_decrementer(void)
2148 {
2149 	cpu_data_t *cdp = getCpuDatap();
2150 	uint32_t dec;
2151 
2152 	assert(ml_get_interrupts_enabled() == FALSE);
2153 
2154 	if (cdp->cpu_get_decrementer_func) {
2155 		dec = cdp->cpu_get_decrementer_func();
2156 	} else {
2157 		uint64_t wide_val;
2158 
2159 		wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2160 		dec = (uint32_t)wide_val;
2161 		assert(wide_val == (uint64_t)dec);
2162 	}
2163 
2164 	return dec;
2165 }
2166 
2167 boolean_t
ml_get_timer_pending(void)2168 ml_get_timer_pending(void)
2169 {
2170 	uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2171 	return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2172 }
2173 
2174 __attribute__((noreturn))
2175 void
platform_syscall(arm_saved_state_t * state)2176 platform_syscall(arm_saved_state_t *state)
2177 {
2178 	uint32_t code;
2179 
2180 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2181 
2182 	code = (uint32_t)get_saved_state_reg(state, 3);
2183 
2184 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2185 	    get_saved_state_reg(state, 0),
2186 	    get_saved_state_reg(state, 1),
2187 	    get_saved_state_reg(state, 2));
2188 
2189 	switch (code) {
2190 	case 2:
2191 		/* set cthread */
2192 		platform_syscall_kprintf("set cthread self.\n");
2193 		thread_set_cthread_self(get_saved_state_reg(state, 0));
2194 		break;
2195 	case 3:
2196 		/* get cthread */
2197 		platform_syscall_kprintf("get cthread self.\n");
2198 		set_saved_state_reg(state, 0, thread_get_cthread_self());
2199 		break;
2200 	case 0: /* I-Cache flush (removed) */
2201 	case 1: /* D-Cache flush (removed) */
2202 	default:
2203 		platform_syscall_kprintf("unknown: %d\n", code);
2204 		break;
2205 	}
2206 
2207 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2208 	    get_saved_state_reg(state, 0));
2209 
2210 	thread_exception_return();
2211 }
2212 
2213 static void
_enable_timebase_event_stream(uint32_t bit_index)2214 _enable_timebase_event_stream(uint32_t bit_index)
2215 {
2216 	uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */
2217 
2218 	if (bit_index >= 64) {
2219 		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2220 	}
2221 
2222 	__asm__ volatile ("mrs	%0, CNTKCTL_EL1" : "=r"(cntkctl));
2223 
2224 	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2225 	cntkctl |= CNTKCTL_EL1_EVNTEN;
2226 	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2227 
2228 	/*
2229 	 * If the SOC supports it (and it isn't broken), enable
2230 	 * EL0 access to the timebase registers.
2231 	 */
2232 	if (user_timebase_type() != USER_TIMEBASE_NONE) {
2233 		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2234 	}
2235 
2236 	__builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
2237 }
2238 
2239 /*
2240  * Turn timer on, unmask that interrupt.
2241  */
2242 static void
_enable_virtual_timer(void)2243 _enable_virtual_timer(void)
2244 {
2245 	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2246 
2247 	__builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
2248 	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2249 	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2250 }
2251 
2252 void
fiq_context_init(boolean_t enable_fiq __unused)2253 fiq_context_init(boolean_t enable_fiq __unused)
2254 {
2255 	/* Interrupts still disabled. */
2256 	assert(ml_get_interrupts_enabled() == FALSE);
2257 	_enable_virtual_timer();
2258 }
2259 
2260 void
wfe_timeout_init(void)2261 wfe_timeout_init(void)
2262 {
2263 	_enable_timebase_event_stream(arm64_eventi);
2264 }
2265 
2266 /**
2267  * Configures, but does not enable, the WFE event stream. The event stream
2268  * generates an event at a set interval to act as a timeout for WFEs.
2269  *
2270  * This function sets the static global variable arm64_eventi to be the proper
2271  * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2272  * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2273  * is used by wfe_timeout_init to actually poke the registers and enable the
2274  * event stream.
2275  *
2276  * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2277  * is the trigger for the system to generate an event. The trigger can occur on
2278  * either the rising or falling edge of the bit depending on the value of
2279  * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2280  * falling edge (1->0) transition to generate events.
2281  */
2282 void
wfe_timeout_configure(void)2283 wfe_timeout_configure(void)
2284 {
2285 	/* Could fill in our own ops here, if we needed them */
2286 	uint64_t        ticks_per_sec, ticks_per_event, events_per_sec = 0;
2287 	uint32_t        bit_index;
2288 
2289 	if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2290 		if (events_per_sec <= 0) {
2291 			events_per_sec = 1;
2292 		} else if (events_per_sec > USEC_PER_SEC) {
2293 			events_per_sec = USEC_PER_SEC;
2294 		}
2295 	} else {
2296 		events_per_sec = USEC_PER_SEC;
2297 	}
2298 	ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2299 	ticks_per_event = ticks_per_sec / events_per_sec;
2300 
2301 	/* Bit index of next power of two greater than ticks_per_event */
2302 	bit_index = flsll(ticks_per_event) - 1;
2303 	/* Round up to next power of two if ticks_per_event is initially power of two */
2304 	if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2305 		bit_index++;
2306 	}
2307 
2308 	/*
2309 	 * The timer can only trigger on rising or falling edge, not both; we don't
2310 	 * care which we trigger on, but we do need to adjust which bit we are
2311 	 * interested in to account for this.
2312 	 *
2313 	 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2314 	 * falling edge of the given bit. Therefore, we must decrement the bit index
2315 	 * by one as when the bit before the one we care about makes a 1 -> 0
2316 	 * transition, the bit we care about makes a 0 -> 1 transition.
2317 	 *
2318 	 * For example if we want an event generated every 8 ticks (if we calculated
2319 	 * a bit_index of 3), we would want the event to be generated whenever the
2320 	 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2321 	 * see that the bit at index 2 makes a falling transition in this scenario,
2322 	 * so we would want EVENTI to be 2 instead of 3.
2323 	 */
2324 	if (bit_index != 0) {
2325 		bit_index--;
2326 	}
2327 
2328 	arm64_eventi = bit_index;
2329 }
2330 
2331 boolean_t
ml_delay_should_spin(uint64_t interval)2332 ml_delay_should_spin(uint64_t interval)
2333 {
2334 	cpu_data_t     *cdp = getCpuDatap();
2335 
2336 	if (cdp->cpu_idle_latency) {
2337 		return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2338 	} else {
2339 		/*
2340 		 * Early boot, latency is unknown. Err on the side of blocking,
2341 		 * which should always be safe, even if slow
2342 		 */
2343 		return FALSE;
2344 	}
2345 }
2346 
2347 boolean_t
ml_thread_is64bit(thread_t thread)2348 ml_thread_is64bit(thread_t thread)
2349 {
2350 	return thread_is_64bit_addr(thread);
2351 }
2352 
/*
 * On DEVELOPMENT/DEBUG kernels, delay for yield_delay_us microseconds when
 * that value is non-zero.  Does nothing on other kernels.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us != 0) {
		delay(yield_delay_us);
	}
#endif
}
2362 
/* Intentionally empty in this implementation. */
void
ml_timer_evaluate(void)
{
}
2367 
2368 boolean_t
ml_timer_forced_evaluation(void)2369 ml_timer_forced_evaluation(void)
2370 {
2371 	return FALSE;
2372 }
2373 
2374 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2375 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2376 {
2377 	/*
2378 	 * For now: update the resource coalition stats of the
2379 	 * current thread's coalition
2380 	 */
2381 	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2382 }
2383 
2384 uint64_t
ml_gpu_stat(__unused thread_t t)2385 ml_gpu_stat(__unused thread_t t)
2386 {
2387 	return 0;
2388 }
2389 
2390 thread_t
current_thread(void)2391 current_thread(void)
2392 {
2393 	return current_thread_fast();
2394 }
2395 
2396 typedef struct{
2397 	ex_cb_t         cb;
2398 	void            *refcon;
2399 }
2400 ex_cb_info_t;
2401 
2402 ex_cb_info_t ex_cb_info[EXCB_CLASS_MAX];
2403 
2404 /*
2405  * Callback registration
2406  * Currently we support only one registered callback per class but
2407  * it should be possible to support more callbacks
2408  */
2409 kern_return_t
ex_cb_register(ex_cb_class_t cb_class,ex_cb_t cb,void * refcon)2410 ex_cb_register(
2411 	ex_cb_class_t   cb_class,
2412 	ex_cb_t                 cb,
2413 	void                    *refcon)
2414 {
2415 	ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2416 
2417 	if ((NULL == cb) || (cb_class >= EXCB_CLASS_MAX)) {
2418 		return KERN_INVALID_VALUE;
2419 	}
2420 
2421 	if (NULL == pInfo->cb) {
2422 		pInfo->cb = cb;
2423 		pInfo->refcon = refcon;
2424 		return KERN_SUCCESS;
2425 	}
2426 	return KERN_FAILURE;
2427 }
2428 
2429 /*
2430  * Called internally by platform kernel to invoke the registered callback for class
2431  */
2432 ex_cb_action_t
ex_cb_invoke(ex_cb_class_t cb_class,vm_offset_t far)2433 ex_cb_invoke(
2434 	ex_cb_class_t   cb_class,
2435 	vm_offset_t             far)
2436 {
2437 	ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2438 	ex_cb_state_t state = {far};
2439 
2440 	if (cb_class >= EXCB_CLASS_MAX) {
2441 		panic("Invalid exception callback class 0x%x", cb_class);
2442 	}
2443 
2444 	if (pInfo->cb) {
2445 		return pInfo->cb(cb_class, pInfo->refcon, &state);
2446 	}
2447 	return EXCB_ACTION_NONE;
2448 }
2449 
2450 #if defined(HAS_APPLE_PAC)
2451 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2452 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2453 {
2454 	assert(task);
2455 	task->disable_user_jop = disable_user_jop;
2456 }
2457 
2458 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2459 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2460 {
2461 	assert(thread);
2462 	if (disable_user_jop) {
2463 		thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2464 	} else {
2465 		thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
2466 	}
2467 }
2468 
2469 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2470 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2471 {
2472 	if (inherit) {
2473 		task->rop_pid = parent_task->rop_pid;
2474 	} else {
2475 		task->rop_pid = early_random();
2476 	}
2477 }
2478 
2479 /**
2480  * jop_pid may be inherited from the parent task or generated inside the shared
2481  * region.  Unfortunately these two parameters are available at very different
2482  * times during task creation, so we need to split this into two steps.
2483  */
2484 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit)2485 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
2486 {
2487 	if (inherit) {
2488 		task->jop_pid = parent_task->jop_pid;
2489 	} else {
2490 		task->jop_pid = ml_default_jop_pid();
2491 	}
2492 }
2493 
2494 void
ml_task_set_jop_pid_from_shared_region(task_t task)2495 ml_task_set_jop_pid_from_shared_region(task_t task)
2496 {
2497 	vm_shared_region_t sr = vm_shared_region_get(task);
2498 	/*
2499 	 * If there's no shared region, we can assign the key arbitrarily.  This
2500 	 * typically happens when Mach-O image activation failed part of the way
2501 	 * through, and this task is in the middle of dying with SIGKILL anyway.
2502 	 */
2503 	if (__improbable(!sr)) {
2504 		task->jop_pid = early_random();
2505 		return;
2506 	}
2507 	vm_shared_region_deallocate(sr);
2508 
2509 	/*
2510 	 * Similarly we have to worry about jetsam having killed the task and
2511 	 * already cleared the shared_region_id.
2512 	 */
2513 	task_lock(task);
2514 	if (task->shared_region_id != NULL) {
2515 		task->jop_pid = shared_region_find_key(task->shared_region_id);
2516 	} else {
2517 		task->jop_pid = early_random();
2518 	}
2519 	task_unlock(task);
2520 }
2521 
2522 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2523 ml_thread_set_jop_pid(thread_t thread, task_t task)
2524 {
2525 	thread->machine.jop_pid = task->jop_pid;
2526 }
2527 #endif /* defined(HAS_APPLE_PAC) */
2528 
2529 #if DEVELOPMENT || DEBUG
2530 static uint64_t minor_badness_suffered = 0;
2531 #endif
2532 void
ml_report_minor_badness(uint32_t __unused badness_id)2533 ml_report_minor_badness(uint32_t __unused badness_id)
2534 {
2535 	#if DEVELOPMENT || DEBUG
2536 	(void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
2537 	#endif
2538 }
2539 
2540 #if defined(HAS_APPLE_PAC)
2541 #if __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM
/**
 * The ARMv8.6 implementation is also safe for non-FPAC CPUs, but less efficient;
 * guest kernels need to use it because they do not know at compile time whether
 * the host CPU supports FPAC.
 */
2547 
2548 /**
2549  * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2550  */
2551 static void *
ml_poison_ptr(void * ptr,ptrauth_key key)2552 ml_poison_ptr(void *ptr, ptrauth_key key)
2553 {
2554 	bool b_key = key & (1ULL << 0);
2555 	uint64_t error_code;
2556 	if (b_key) {
2557 		error_code = 2;
2558 	} else {
2559 		error_code = 1;
2560 	}
2561 
2562 	bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2563 	bool data_key = key & (1ULL << 1);
2564 	/* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2565 	bool tbi = data_key && !kernel_pointer;
2566 	unsigned int poison_shift;
2567 	if (tbi) {
2568 		poison_shift = 53;
2569 	} else {
2570 		poison_shift = 61;
2571 	}
2572 
2573 	uintptr_t poisoned = (uintptr_t)ptr;
2574 	poisoned &= ~(3ULL << poison_shift);
2575 	poisoned |= error_code << poison_shift;
2576 	return (void *)poisoned;
2577 }
2578 
2579 /*
2580  * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
2581  * compiler to assume this operation has side-effects and cannot be reordered
2582  */
2583 #define ptrauth_sign_volatile(__value, __suffix, __data)                \
2584 	({                                                              \
2585 	        void *__ret = __value;                                  \
2586 	        asm volatile (                                          \
2587 	                "pac" #__suffix "	%[value], %[data]"          \
2588 	                : [value] "+r"(__ret)                           \
2589 	                : [data] "r"(__data)                            \
2590 	        );                                                      \
2591 	        __ret;                                                  \
2592 	})
2593 
2594 #define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier)                           \
2595 	do {                                                                                    \
2596 	        void *stripped = ptrauth_strip(_ptr, _key);                                     \
2597 	        void *reauthed = ptrauth_sign_volatile(stripped, _suffix, _modifier);           \
2598 	        if (__probable(_ptr == reauthed)) {                                             \
2599 	                _ptr = stripped;                                                        \
2600 	        } else {                                                                        \
2601 	                _ptr = ml_poison_ptr(stripped, _key);                                   \
2602 	        }                                                                               \
2603 	} while (0)
2604 
2605 #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
2606 	ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier)
2607 #else
2608 #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
2609 	asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
2610 #endif /* __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM */
2611 
2612 /**
2613  * Authenticates a signed pointer without trapping on failure.
2614  *
2615  * @warning This function must be called with interrupts disabled.
2616  *
2617  * @warning Pointer authentication failure should normally be treated as a fatal
2618  * error.  This function is intended for a handful of callers that cannot panic
2619  * on failure, and that understand the risks in handling a poisoned return
2620  * value.  Other code should generally use the trapping variant
2621  * ptrauth_auth_data() instead.
2622  *
2623  * @param ptr the pointer to authenticate
2624  * @param key which key to use for authentication
2625  * @param modifier a modifier to mix into the key
2626  * @return an authenticated version of ptr, possibly with poison bits set
2627  */
2628 void *
ml_auth_ptr_unchecked(void * ptr,ptrauth_key key,uint64_t modifier)2629 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
2630 {
2631 	switch (key & 0x3) {
2632 	case ptrauth_key_asia:
2633 		_ml_auth_ptr_unchecked(ptr, ia, modifier);
2634 		break;
2635 	case ptrauth_key_asib:
2636 		_ml_auth_ptr_unchecked(ptr, ib, modifier);
2637 		break;
2638 	case ptrauth_key_asda:
2639 		_ml_auth_ptr_unchecked(ptr, da, modifier);
2640 		break;
2641 	case ptrauth_key_asdb:
2642 		_ml_auth_ptr_unchecked(ptr, db, modifier);
2643 		break;
2644 	}
2645 
2646 	return ptr;
2647 }
2648 #endif /* defined(HAS_APPLE_PAC) */
2649 
2650 #ifdef CONFIG_XNUPOST
2651 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2652 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2653 {
2654 	thread_t thread = current_thread();
2655 	thread->machine.expected_fault_handler = expected_fault_handler;
2656 	thread->machine.expected_fault_addr = expected_fault_addr;
2657 }
2658 
2659 void
ml_expect_fault_end(void)2660 ml_expect_fault_end(void)
2661 {
2662 	thread_t thread = current_thread();
2663 	thread->machine.expected_fault_handler = NULL;
2664 	thread->machine.expected_fault_addr = 0;
2665 }
2666 #endif /* CONFIG_XNUPOST */
2667 
/*
 * On HIBERNATION kernels, rebuild the VM structures when the system is
 * waking from hibernation.  Does nothing otherwise.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState == kIOHibernateStateWakingFromHibernate) {
		hibernate_rebuild_vm_structs();
	}
#endif /* HIBERNATION */
}
2678 
/*
 * On HIBERNATION kernels, when the system is waking from hibernation: run
 * hibernate_machine_init(), end the hibernate VM lock, and clear the current
 * CPU's cpu_hibernate flag.  Does nothing otherwise.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2690 
2691 /**
2692  * Return back a machine-dependent array of address space regions that should be
2693  * reserved by the VM (pre-mapped in the address space). This will prevent user
2694  * processes from allocating or deallocating from within these regions.
2695  *
2696  * @param vm_is64bit True if the process has a 64-bit address space.
2697  * @param regions An out parameter representing an array of regions to reserve.
2698  *
2699  * @return The number of reserved regions returned through `regions`.
2700  */
2701 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,const struct vm_reserved_region ** regions)2702 ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
2703 {
2704 	assert(regions != NULL);
2705 
2706 	/**
2707 	 * Reserved regions only apply to 64-bit address spaces. This is because
2708 	 * we only expect to grow the maximum user VA address on 64-bit address spaces
2709 	 * (we've essentially already reached the max for 32-bit spaces). The reserved
2710 	 * regions should safely fall outside of the max user VA for 32-bit processes.
2711 	 */
2712 	if (vm_is64bit) {
2713 		*regions = vm_reserved_regions;
2714 		return ARRAY_COUNT(vm_reserved_regions);
2715 	} else {
2716 		/* Don't reserve any VA regions on arm64_32 processes. */
2717 		*regions = NULL;
2718 		return 0;
2719 	}
2720 }
2721 
2722 /* These WFE recommendations are expected to be updated on a relatively
2723  * infrequent cadence, possibly from a different cluster, hence
2724  * false cacheline sharing isn't expected to be material
2725  */
2726 static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
2727 
2728 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)2729 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
2730 {
2731 	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
2732 	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
2733 	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
2734 	return 0; /* Success */
2735 }
2736 
2737 #if DEVELOPMENT || DEBUG
2738 int wfe_rec_max = 0;
2739 int wfe_rec_none = 0;
2740 uint64_t wfe_rec_override_mat = 0;
2741 uint64_t wfe_rec_clamp = 0;
2742 #endif
2743 
2744 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2745 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2746 {
2747 	/* This and its consumer does not synchronize vis-a-vis updates
2748 	 * of the recommendation; races are acceptable.
2749 	 */
2750 	uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2751 #if DEVELOPMENT || DEBUG
2752 	if (wfe_rec_clamp) {
2753 		wfet = MIN(wfe_rec_clamp, wfet);
2754 	}
2755 
2756 	if (wfe_rec_max) {
2757 		for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2758 			if (arm64_cluster_wfe_recs[i] > wfet) {
2759 				wfet = arm64_cluster_wfe_recs[i];
2760 			}
2761 		}
2762 	}
2763 
2764 	if (wfe_rec_none) {
2765 		wfet = 0;
2766 	}
2767 
2768 	if (wfe_rec_override_mat) {
2769 		wfet = wfe_rec_override_mat;
2770 	}
2771 #endif
2772 	return wfet;
2773 }
2774 
2775 __pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)2776 ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
2777 {
2778 #if   XNU_MONITOR
2779 	return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
2780 #else
2781 	return false;
2782 #endif /* XNU_MONITOR */
2783 }
2784 
/*
 * Return the program counter recorded in `state`, for use by backtracing.
 * `state` must be non-NULL and hold 64-bit saved state (asserted).
 */
uint64_t
ml_get_backtrace_pc(struct arm_saved_state *state)
{
	assert((state != NULL) && is_saved_state64(state));

	uint64_t pc = get_saved_state_pc(state);

	return pc;
}
2793