xref: /xnu-8020.121.3/osfmk/arm64/machine_routines.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <arm64/machine_machdep.h>
30 #include <arm64/proc_reg.h>
31 #include <arm/machine_cpu.h>
32 #include <arm/cpu_internal.h>
33 #include <arm/cpuid.h>
34 #include <arm/io_map_entries.h>
35 #include <arm/cpu_data.h>
36 #include <arm/cpu_data_internal.h>
37 #include <arm/caches_internal.h>
38 #include <arm/misc_protos.h>
39 #include <arm/machdep_call.h>
40 #include <arm/machine_routines.h>
41 #include <arm/rtclock.h>
42 #include <arm/cpuid_internal.h>
43 #include <arm/cpu_capabilities.h>
44 #include <console/serial_protos.h>
45 #include <kern/machine.h>
46 #include <kern/misc_protos.h>
47 #include <prng/random.h>
48 #include <kern/startup.h>
49 #include <kern/thread.h>
50 #include <kern/timer_queue.h>
51 #include <mach/machine.h>
52 #include <machine/atomic.h>
53 #include <machine/config.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_shared_region.h>
57 #include <vm/vm_map.h>
58 #include <sys/codesign.h>
59 #include <sys/kdebug.h>
60 #include <kern/coalition.h>
61 #include <pexpert/device_tree.h>
62 
63 #include <IOKit/IOPlatformExpert.h>
64 #if HIBERNATION
65 #include <IOKit/IOHibernatePrivate.h>
66 #endif /* HIBERNATION */
67 
68 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
69 #include <arm64/amcc_rorgn.h>
70 #endif
71 
72 
73 #include <libkern/section_keywords.h>
74 
75 /**
76  * On supported hardware, debuggable builds make the HID bits read-only
77  * without locking them.  This lets people manually modify HID bits while
78  * debugging, since they can use a debugging tool to first reset the HID
79  * bits back to read/write.  However it will still catch xnu changes that
80  * accidentally write to HID bits after they've been made read-only.
81  */
82 
83 #if KPC
84 #include <kern/kpc.h>
85 #endif
86 
87 #define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
88 #define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
89 
90 #if HAS_CLUSTER
91 static uint8_t cluster_initialized = 0;
92 #endif
93 
94 MACHINE_TIMEOUT32_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
95 machine_timeout32_t LockTimeOutUsec; // computed in ml_init_lock_timeout
96 
97 MACHINE_TIMEOUT_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
98 
99 MACHINE_TIMEOUT32_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
100 
101 uint64_t low_MutexSpin;
102 int64_t high_MutexSpin;
103 
104 
105 
106 static uint64_t ml_wfe_hint_max_interval;
107 #define MAX_WFE_HINT_INTERVAL_US (500ULL)
108 
109 /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
110 TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
111 
112 extern vm_offset_t   segLOWEST;
113 extern vm_offset_t   segLOWESTTEXT;
114 extern vm_offset_t   segLASTB;
115 extern unsigned long segSizeLAST;
116 
117 /* ARM64 specific bounds; used to test for presence in the kernelcache. */
118 extern vm_offset_t   vm_kernelcache_base;
119 extern vm_offset_t   vm_kernelcache_top;
120 
121 #if defined(HAS_IPI)
122 unsigned int gFastIPI = 1;
123 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
124 static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
125     kDeferredIPITimerDefault);
126 #endif /* defined(HAS_IPI) */
127 
128 thread_t Idle_context(void);
129 
130 SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
131 
132 SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
133 SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
134 SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
135 	.version = CPU_TOPOLOGY_VERSION,
136 	.cpus = topology_cpu_array,
137 	.clusters = topology_cluster_array,
138 };
139 
140 _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
141 
142 /**
143  * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
144  * entries of an arbitrary data type.  This is intended for use by specialized consumers
145  * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
146  * as follows:
147  *	hypothetical_array[cluster_offsets[AFF1] + AFF0]
148  * Most consumers should instead use general-purpose facilities such as PERCPU or
149  * ml_get_cpu_number().
150  */
151 SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
152 
153 SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
154 
155 extern uint32_t lockdown_done;
156 
157 /**
158  * Represents regions of virtual address space that should be reserved
159  * (pre-mapped) in each user address space.
160  */
161 SECURITY_READ_ONLY_LATE(static struct vm_reserved_region) vm_reserved_regions[] = {
162 	{
163 		.vmrr_name = "GPU Carveout",
164 		.vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
165 		.vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
166 	},
167 	/*
168 	 * Reserve the virtual memory space representing the commpage nesting region
169 	 * to prevent user processes from allocating memory within it. The actual
170 	 * page table entries for the commpage are inserted by vm_commpage_enter().
171 	 * This vm_map_enter() just prevents userspace from allocating/deallocating
172 	 * anything within the entire commpage nested region.
173 	 */
174 	{
175 		.vmrr_name = "commpage nesting",
176 		.vmrr_addr = _COMM_PAGE64_NESTING_START,
177 		.vmrr_size = _COMM_PAGE64_NESTING_SIZE
178 	}
179 };
180 
181 uint32_t get_arm_cpu_version(void);
182 
183 #if defined(HAS_IPI)
/*
 * Issue an IPI request of the given type to the CPU identified by cpu_mpidr.
 *
 * cpu_mpidr: MPIDR value of the target; its AFF0 field selects the CPU and
 *            its AFF1 field selects the cluster.
 * type:      request type bits, OR'ed into the value written to the
 *            request register.
 *
 * With HAS_CLUSTER, a target in the caller's own cluster is signalled through
 * S3_5_C15_C0_0; any other target goes through S3_5_C15_C0_1 with the target
 * cluster encoded at IPI_RR_TARGET_CLUSTER_SHIFT.  Without HAS_CLUSTER the
 * request always goes through S3_5_C15_C0_1.
 */
static inline void
ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
{
#if HAS_CLUSTER
/* Bit position of the target-cluster field in a cross-cluster request. */
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
	uint64_t self_mpidr;
	uint64_t request;

	/* NOTE: this logic expects that we are called in a non-preemptible
	 * context, or at least one in which the calling thread is bound
	 * to a single CPU.  Otherwise we may migrate between choosing which
	 * IPI mechanism to use and issuing the IPI. */
	MRS(self_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(self_mpidr) != MPIDR_CLUSTER_ID(cpu_mpidr)) {
		/* Target lives in another cluster: include its cluster ID. */
		request = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", request);
	} else {
		/* Target shares our cluster: only the CPU ID is needed. */
		request = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", request);
	}
#else
	uint64_t request = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", request);
#endif
}
207 #endif
208 
209 #if !defined(HAS_IPI)
210 __dead2
211 #endif
212 void
ml_cpu_signal(unsigned int cpu_mpidr __unused)213 ml_cpu_signal(unsigned int cpu_mpidr __unused)
214 {
215 #if defined(HAS_IPI)
216 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
217 #else
218 	panic("Platform does not support ACC Fast IPI");
219 #endif
220 }
221 
222 #if !defined(HAS_IPI)
223 __dead2
224 #endif
225 void
ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)226 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
227 {
228 #if defined(HAS_IPI)
229 	/* adjust IPI_CR timer countdown value for deferred IPI
230 	 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
231 	 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
232 	 *
233 	 * global register, should only require a single write to update all
234 	 * CPU cores: from Skye ACC user spec section 5.7.3.3
235 	 *
236 	 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
237 	 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
238 	 */
239 	uint64_t abstime;
240 
241 	nanoseconds_to_absolutetime(nanosecs, &abstime);
242 
243 	abstime = MIN(abstime, 0xFFFF);
244 
245 	/* update deferred_ipi_timer_ns with the new clamped value */
246 	absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
247 
248 	MSR("S3_5_C15_C3_1", abstime);
249 #else
250 	(void)nanosecs;
251 	panic("Platform does not support ACC Fast IPI");
252 #endif
253 }
254 
/*
 * Return the deferred-IPI countdown currently in effect, in nanoseconds.
 *
 * Returns deferred_ipi_timer_ns on HAS_IPI builds, and 0 on builds without
 * fast IPI support.
 *
 * The parameter list is spelled (void) so that the definition is a proper
 * prototype; an empty () list leaves the parameters unspecified in C
 * versions prior to C23.
 */
uint64_t
ml_cpu_signal_deferred_get_timer(void)
{
#if defined(HAS_IPI)
	return deferred_ipi_timer_ns;
#else
	return 0;
#endif
}
264 
265 #if !defined(HAS_IPI)
266 __dead2
267 #endif
268 void
ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)269 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
270 {
271 #if defined(HAS_IPI)
272 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
273 #else
274 	panic("Platform does not support ACC Fast IPI deferral");
275 #endif
276 }
277 
278 #if !defined(HAS_IPI)
279 __dead2
280 #endif
281 void
ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)282 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
283 {
284 #if defined(HAS_IPI)
285 	ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
286 #else
287 	panic("Platform does not support ACC Fast IPI retraction");
288 #endif
289 }
290 
291 extern uint32_t idle_proximate_io_wfe_unmasked;
292 
293 #define CPUPM_IDLE_WFE 0x5310300
294 static bool
wfe_process_recommendation(void)295 wfe_process_recommendation(void)
296 {
297 	bool ipending = false;
298 	if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
299 		/* Check for an active perf. controller generated
300 		 * WFE recommendation for this cluster.
301 		 */
302 		cpu_data_t *cdp = getCpuDatap();
303 		uint32_t cid = cdp->cpu_cluster_id;
304 		uint64_t wfe_ttd = 0;
305 		uint64_t wfe_deadline = 0;
306 
307 		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
308 			wfe_deadline = mach_absolute_time() + wfe_ttd;
309 		}
310 
311 		if (wfe_deadline != 0) {
312 			/* Poll issuing event-bounded WFEs until an interrupt
313 			 * arrives or the WFE recommendation expires
314 			 */
315 #if DEVELOPMENT || DEBUG
316 			uint64_t wc = cdp->wfe_count;
317 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
318 #endif
319 			/* Issue WFE until the recommendation expires,
320 			 * with IRQs unmasked.
321 			 */
322 			ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true);
323 #if DEVELOPMENT || DEBUG
324 			KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
325 #endif
326 		}
327 	}
328 	return ipending;
329 }
330 
331 void
machine_idle(void)332 machine_idle(void)
333 {
334 	/* Interrupts are expected to be masked on entry or re-entry via
335 	 * Idle_load_context()
336 	 */
337 	assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
338 	/* Check for, and act on, a WFE recommendation.
339 	 * Bypasses context spill/fill for a minor perf. increment.
340 	 * May unmask and restore IRQ+FIQ mask.
341 	 */
342 	if (wfe_process_recommendation() == false) {
343 		/* If WFE recommendation absent, or WFE deadline
344 		 * arrived with no interrupt pending/processed,
345 		 * fall back to WFI.
346 		 */
347 		Idle_context();
348 	}
349 	__builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
350 }
351 
352 void
OSSynchronizeIO(void)353 OSSynchronizeIO(void)
354 {
355 	__builtin_arm_dsb(DSB_SY);
356 }
357 
/*
 * Return the current value of ACTLR_EL1.
 */
uint64_t
get_aux_control(void)
{
	uint64_t actlr;

	MRS(actlr, "ACTLR_EL1");

	return actlr;
}
366 
/*
 * Return the current value of SCTLR_EL1.
 */
uint64_t
get_mmu_control(void)
{
	uint64_t sctlr;

	MRS(sctlr, "SCTLR_EL1");

	return sctlr;
}
375 
/*
 * Return the current value of TCR_EL1.
 */
uint64_t
get_tcr(void)
{
	uint64_t tcr;

	MRS(tcr, "TCR_EL1");

	return tcr;
}
384 
385 boolean_t
ml_get_interrupts_enabled(void)386 ml_get_interrupts_enabled(void)
387 {
388 	uint64_t        value;
389 
390 	MRS(value, "DAIF");
391 	if (value & DAIF_IRQF) {
392 		return FALSE;
393 	}
394 	return TRUE;
395 }
396 
397 pmap_paddr_t
get_mmu_ttb(void)398 get_mmu_ttb(void)
399 {
400 	pmap_paddr_t    value;
401 
402 	MRS(value, "TTBR0_EL1");
403 	return value;
404 }
405 
406 uint32_t
get_arm_cpu_version(void)407 get_arm_cpu_version(void)
408 {
409 	uint32_t value = machine_read_midr();
410 
411 	/* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
412 	return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
413 }
414 
/*
 * Test a feature bit against AIDR_EL1.
 *
 * feature_bit: mask to test.
 *
 * Returns true when any bit of feature_bit is set in AIDR_EL1.
 */
bool
ml_feature_supported(uint32_t feature_bit)
{
	uint64_t aidr = 0;

	MRS(aidr, "AIDR_EL1");

	return (aidr & feature_bit) != 0;
}
425 
426 /*
427  * user_cont_hwclock_allowed()
428  *
429  * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
430  * as a continuous time source (e.g. from mach_continuous_time)
431  */
432 boolean_t
user_cont_hwclock_allowed(void)433 user_cont_hwclock_allowed(void)
434 {
435 #if HAS_CONTINUOUS_HWCLOCK
436 	return TRUE;
437 #else
438 	return FALSE;
439 #endif
440 }
441 
442 /*
443  * user_timebase_type()
444  *
445  * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
446  *
447  * USER_TIMEBASE_NONE: EL0 has no access to timebase register
448  * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
449  * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
450  *
451  */
452 
453 uint8_t
user_timebase_type(void)454 user_timebase_type(void)
455 {
456 #if   __ARM_ARCH_8_6__
457 	return USER_TIMEBASE_NOSPEC;
458 #else
459 	return USER_TIMEBASE_SPEC;
460 #endif
461 }
462 
463 void
machine_startup(__unused boot_args * args)464 machine_startup(__unused boot_args * args)
465 {
466 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
467 	if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
468 		gFastIPI = 1;
469 	}
470 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
471 
472 
473 	machine_conf();
474 
475 
476 	/*
477 	 * Kick off the kernel bootstrap.
478 	 */
479 	kernel_bootstrap();
480 	/* NOTREACHED */
481 }
482 
483 typedef void (*invalidate_fn_t)(void);
484 
485 static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
486 
487 void set_invalidate_hmac_function(invalidate_fn_t fn);
488 
489 void
set_invalidate_hmac_function(invalidate_fn_t fn)490 set_invalidate_hmac_function(invalidate_fn_t fn)
491 {
492 	if (NULL != invalidate_hmac_function) {
493 		panic("Invalidate HMAC function already set");
494 	}
495 
496 	invalidate_hmac_function = fn;
497 }
498 
499 void
machine_lockdown(void)500 machine_lockdown(void)
501 {
502 	arm_vm_prot_finalize(PE_state.bootArgs);
503 
504 #if CONFIG_KERNEL_INTEGRITY
505 #if KERNEL_INTEGRITY_WT
506 	/* Watchtower
507 	 *
508 	 * Notify the monitor about the completion of early kernel bootstrap.
509 	 * From this point forward it will enforce the integrity of kernel text,
510 	 * rodata and page tables.
511 	 */
512 
513 #ifdef MONITOR
514 	monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
515 #endif
516 #endif /* KERNEL_INTEGRITY_WT */
517 
518 #if XNU_MONITOR
519 	pmap_lockdown_ppl();
520 #endif
521 
522 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
523 	/* KTRR
524 	 *
525 	 * Lock physical KTRR region. KTRR region is read-only. Memory outside
526 	 * the region is not executable at EL1.
527 	 */
528 
529 	rorgn_lockdown();
530 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
531 
532 
533 #endif /* CONFIG_KERNEL_INTEGRITY */
534 
535 
536 	if (NULL != invalidate_hmac_function) {
537 		invalidate_hmac_function();
538 	}
539 
540 	lockdown_done = 1;
541 }
542 
543 
544 char           *
machine_boot_info(__unused char * buf,__unused vm_size_t size)545 machine_boot_info(
546 	__unused char *buf,
547 	__unused vm_size_t size)
548 {
549 	return PE_boot_args();
550 }
551 
552 void
slave_machine_init(__unused void * param)553 slave_machine_init(__unused void *param)
554 {
555 	cpu_machine_init();     /* Initialize the processor */
556 	clock_init();           /* Init the clock */
557 }
558 
559 /*
560  *	Routine:        machine_processor_shutdown
561  *	Function:
562  */
563 thread_t
machine_processor_shutdown(__unused thread_t thread,void (* doshutdown)(processor_t),processor_t processor)564 machine_processor_shutdown(
565 	__unused thread_t thread,
566 	void (*doshutdown)(processor_t),
567 	processor_t processor)
568 {
569 	return Shutdown_context(doshutdown, processor);
570 }
571 
572 /*
573  *      Routine:        ml_init_lock_timeout
574  *      Function:
575  */
576 void
ml_init_lock_timeout(void)577 ml_init_lock_timeout(void)
578 {
579 	/*
580 	 * This function is called after STARUP_SUB_TIMEOUTS
581 	 * initialization, so using the "legacy" boot-args here overrides
582 	 * the ml-timeout-...  configuration. (Given that these boot-args
583 	 * here are usually explicitly specified, this makes sense by
584 	 * overriding ml-timeout-..., which may come from the device tree.
585 	 */
586 
587 	uint64_t lto_timeout_ns;
588 	uint64_t lto_abstime;
589 	uint32_t slto;
590 
591 	if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
592 		lto_timeout_ns = slto * NSEC_PER_USEC;
593 		nanoseconds_to_absolutetime(lto_timeout_ns, &lto_abstime);
594 		os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
595 	} else {
596 		lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
597 		absolutetime_to_nanoseconds(lto_abstime, &lto_timeout_ns);
598 	}
599 
600 	os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
601 
602 	if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
603 		nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, &lto_abstime);
604 		os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
605 	} else if (lto_abstime != 0) {
606 		os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
607 	} // else take default from MACHINE_TIMEOUT.
608 
609 	uint64_t mtxspin;
610 	uint64_t mtx_abstime;
611 	if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
612 		if (mtxspin > USEC_PER_SEC >> 4) {
613 			mtxspin =  USEC_PER_SEC >> 4;
614 		}
615 		nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
616 		os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
617 	} else {
618 		mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
619 	}
620 
621 	low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
622 	/*
623 	 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
624 	 * real_ncpus is not set at this time
625 	 *
626 	 * NOTE: active spinning is disabled in arm. It can be activated
627 	 * by setting high_MutexSpin through the sysctl.
628 	 */
629 	high_MutexSpin = low_MutexSpin;
630 
631 	uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
632 	PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
633 	nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
634 }
635 
636 /*
637  * This is called when all of the ml_processor_info_t structures have been
638  * initialized and all the processors have been started through processor_start().
639  *
640  * Required by the scheduler subsystem.
641  */
642 void
ml_cpu_init_completed(void)643 ml_cpu_init_completed(void)
644 {
645 	if (SCHED(cpu_init_completed) != NULL) {
646 		SCHED(cpu_init_completed)();
647 	}
648 }
649 
650 /*
651  * This is called from the machine-independent routine cpu_up()
652  * to perform machine-dependent info updates.
653  */
654 void
ml_cpu_up(void)655 ml_cpu_up(void)
656 {
657 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[ml_get_cpu_number_local()];
658 
659 	os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
660 
661 	os_atomic_inc(&machine_info.physical_cpu, relaxed);
662 	os_atomic_inc(&machine_info.logical_cpu, relaxed);
663 }
664 
665 /*
666  * This is called from the machine-independent routine cpu_down()
667  * to perform machine-dependent info updates.
668  */
669 void
ml_cpu_down(void)670 ml_cpu_down(void)
671 {
672 	cpu_data_t      *cpu_data_ptr;
673 	ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[ml_get_cpu_number_local()];
674 
675 	os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
676 
677 	os_atomic_dec(&machine_info.physical_cpu, relaxed);
678 	os_atomic_dec(&machine_info.logical_cpu, relaxed);
679 
680 	/*
681 	 * If we want to deal with outstanding IPIs, we need to
682 	 * do relatively early in the processor_doshutdown path,
683 	 * as we pend decrementer interrupts using the IPI
684 	 * mechanism if we cannot immediately service them (if
685 	 * IRQ is masked).  Do so now.
686 	 *
687 	 * We aren't on the interrupt stack here; would it make
688 	 * more sense to disable signaling and then enable
689 	 * interrupts?  It might be a bit cleaner.
690 	 */
691 	cpu_data_ptr = getCpuDatap();
692 	cpu_data_ptr->cpu_running = FALSE;
693 
694 	if (cpu_data_ptr != &BootCpuData) {
695 		/*
696 		 * Move all of this cpu's timers to the master/boot cpu,
697 		 * and poke it in case there's a sooner deadline for it to schedule.
698 		 */
699 		timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
700 		kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
701 		if (rv != KERN_SUCCESS) {
702 			panic("ml_cpu_down: IPI failure %d", rv);
703 		}
704 	}
705 
706 	cpu_signal_handler_internal(TRUE);
707 }
708 
709 unsigned int
ml_get_machine_mem(void)710 ml_get_machine_mem(void)
711 {
712 	return machine_info.memory_size;
713 }
714 
715 __attribute__((noreturn))
716 void
halt_all_cpus(boolean_t reboot)717 halt_all_cpus(boolean_t reboot)
718 {
719 	if (reboot) {
720 		printf("MACH Reboot\n");
721 		PEHaltRestart(kPERestartCPU);
722 	} else {
723 		printf("CPU halted\n");
724 		PEHaltRestart(kPEHaltCPU);
725 	}
726 	while (1) {
727 		;
728 	}
729 }
730 
731 __attribute__((noreturn))
732 void
halt_cpu(void)733 halt_cpu(void)
734 {
735 	halt_all_cpus(FALSE);
736 }
737 
738 /*
739  *	Routine:        machine_signal_idle
740  *	Function:
741  */
742 void
machine_signal_idle(processor_t processor)743 machine_signal_idle(
744 	processor_t processor)
745 {
746 	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
747 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
748 }
749 
750 void
machine_signal_idle_deferred(processor_t processor)751 machine_signal_idle_deferred(
752 	processor_t processor)
753 {
754 	cpu_signal_deferred(processor_to_cpu_datap(processor));
755 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
756 }
757 
758 void
machine_signal_idle_cancel(processor_t processor)759 machine_signal_idle_cancel(
760 	processor_t processor)
761 {
762 	cpu_signal_cancel(processor_to_cpu_datap(processor));
763 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
764 }
765 
766 /*
767  *	Routine:        ml_install_interrupt_handler
768  *	Function:	Initialize Interrupt Handler
769  */
770 void
ml_install_interrupt_handler(void * nub,int source,void * target,IOInterruptHandler handler,void * refCon)771 ml_install_interrupt_handler(
772 	void *nub,
773 	int source,
774 	void *target,
775 	IOInterruptHandler handler,
776 	void *refCon)
777 {
778 	cpu_data_t     *cpu_data_ptr;
779 	boolean_t       current_state;
780 
781 	current_state = ml_set_interrupts_enabled(FALSE);
782 	cpu_data_ptr = getCpuDatap();
783 
784 	cpu_data_ptr->interrupt_nub = nub;
785 	cpu_data_ptr->interrupt_source = source;
786 	cpu_data_ptr->interrupt_target = target;
787 	cpu_data_ptr->interrupt_handler = handler;
788 	cpu_data_ptr->interrupt_refCon = refCon;
789 
790 	(void) ml_set_interrupts_enabled(current_state);
791 }
792 
/*
 *	Routine:        ml_init_interrupt
 *	Function:	Initialize Interrupts
 *
 *	On HAS_IPI builds, programs the deferred-IPI timer from
 *	deferred_ipi_timer_ns.  On other builds this is a no-op.
 */
void
ml_init_interrupt(void)
{
#if defined(HAS_IPI)
	/*
	 * ml_init_interrupt is called once for each CPU, but repeating the
	 * write is redundant because there is only one global copy of the
	 * register for skye.  Only CPUs flagged as cluster_master perform it.
	 */
	if (!getCpuDatap()->cluster_master) {
		return;
	}

	ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
#endif
}
811 
812 /*
813  *	Routine:        ml_init_timebase
814  *	Function:	register and setup Timebase, Decremeter services
815  */
816 void
ml_init_timebase(void * args,tbd_ops_t tbd_funcs,vm_offset_t int_address,vm_offset_t int_value __unused)817 ml_init_timebase(
818 	void            *args,
819 	tbd_ops_t       tbd_funcs,
820 	vm_offset_t     int_address,
821 	vm_offset_t     int_value __unused)
822 {
823 	cpu_data_t     *cpu_data_ptr;
824 
825 	cpu_data_ptr = (cpu_data_t *)args;
826 
827 	if ((cpu_data_ptr == &BootCpuData)
828 	    && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
829 		rtclock_timebase_func = *tbd_funcs;
830 		rtclock_timebase_addr = int_address;
831 	}
832 }
833 
834 #define ML_READPROP_MANDATORY UINT64_MAX
835 
836 static uint64_t
ml_readprop(const DTEntry entry,const char * propertyName,uint64_t default_value)837 ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
838 {
839 	void const *prop;
840 	unsigned int propSize;
841 
842 	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
843 		if (propSize == sizeof(uint8_t)) {
844 			return *((uint8_t const *)prop);
845 		} else if (propSize == sizeof(uint16_t)) {
846 			return *((uint16_t const *)prop);
847 		} else if (propSize == sizeof(uint32_t)) {
848 			return *((uint32_t const *)prop);
849 		} else if (propSize == sizeof(uint64_t)) {
850 			return *((uint64_t const *)prop);
851 		} else {
852 			panic("CPU property '%s' has bad size %u", propertyName, propSize);
853 		}
854 	} else {
855 		if (default_value == ML_READPROP_MANDATORY) {
856 			panic("Missing mandatory property '%s'", propertyName);
857 		}
858 		return default_value;
859 	}
860 }
861 
862 static boolean_t
ml_read_reg_range(const DTEntry entry,const char * propertyName,uint64_t * pa_ptr,uint64_t * len_ptr)863 ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
864 {
865 	uint64_t const *prop;
866 	unsigned int propSize;
867 
868 	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
869 		return FALSE;
870 	}
871 
872 	if (propSize != sizeof(uint64_t) * 2) {
873 		panic("Wrong property size for %s", propertyName);
874 	}
875 
876 	*pa_ptr = prop[0];
877 	*len_ptr = prop[1];
878 	return TRUE;
879 }
880 
881 static boolean_t
ml_is_boot_cpu(const DTEntry entry)882 ml_is_boot_cpu(const DTEntry entry)
883 {
884 	void const *prop;
885 	unsigned int propSize;
886 
887 	if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
888 		panic("unable to retrieve state for cpu");
889 	}
890 
891 	if (strncmp((char const *)prop, "running", propSize) == 0) {
892 		return TRUE;
893 	} else {
894 		return FALSE;
895 	}
896 }
897 
898 static void
ml_read_chip_revision(unsigned int * rev __unused)899 ml_read_chip_revision(unsigned int *rev __unused)
900 {
901 	// The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
902 #ifdef APPLE_ARM64_ARCH_FAMILY
903 	DTEntry         entryP;
904 
905 	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
906 		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
907 	} else {
908 		*rev = CPU_VERSION_UNKNOWN;
909 	}
910 #endif
911 }
912 
913 void
ml_parse_cpu_topology(void)914 ml_parse_cpu_topology(void)
915 {
916 	DTEntry entry, child __unused;
917 	OpaqueDTEntryIterator iter;
918 	uint32_t cpu_boot_arg = MAX_CPUS;
919 	uint64_t cpumask_boot_arg = ULLONG_MAX;
920 	int err;
921 
922 	int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
923 	int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
924 	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
925 	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
926 
927 	// The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
928 	// so that we trigger a panic later in the boot process, once serial is enabled.
929 	if (cpus_boot_arg_present && cpumask_boot_arg_present) {
930 		cpu_config_correct = false;
931 	}
932 
933 	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
934 	assert(err == kSuccess);
935 
936 	err = SecureDTInitEntryIterator(entry, &iter);
937 	assert(err == kSuccess);
938 
939 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
940 		cluster_offsets[i] = -1;
941 		cluster_phys_to_logical[i] = -1;
942 		cluster_max_cpu_phys_id[i] = 0;
943 	}
944 
945 	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
946 		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
947 		boolean_t cpu_enabled = cpumask_boot_arg & 1;
948 		cpumask_boot_arg >>= 1;
949 
950 		// Boot CPU disabled in cpumask. Flag this so that we trigger a panic
951 		// later in the boot process, once serial is enabled.
952 		if (is_boot_cpu && !cpu_enabled) {
953 			cpu_config_correct = false;
954 		}
955 
956 		// Ignore this CPU if it has been disabled by the cpumask= boot-arg.
957 		if (!is_boot_cpu && !cpu_enabled) {
958 			continue;
959 		}
960 
961 		// If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
962 		// been added to the topology struct yet, and we only have one slot left, then skip
963 		// every other non-boot CPU in order to leave room for the boot CPU.
964 		//
965 		// e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
966 		// array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
967 		if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
968 			continue;
969 		}
970 		if (topology_info.num_cpus >= cpu_boot_arg) {
971 			break;
972 		}
973 
974 		ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
975 
976 		cpu->cpu_id = topology_info.num_cpus++;
977 		assert(cpu->cpu_id < MAX_CPUS);
978 		topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
979 
980 		cpu->reserved = 0;
981 		topology_info.reserved = 0;
982 
983 		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
984 
985 		cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
986 		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
987 		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
988 		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
989 		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
990 
991 		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
992 		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
993 		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
994 		cpu->cluster_type = CLUSTER_TYPE_SMP;
995 
996 		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
997 		if (cluster_type == 'E') {
998 			cpu->cluster_type = CLUSTER_TYPE_E;
999 		} else if (cluster_type == 'P') {
1000 			cpu->cluster_type = CLUSTER_TYPE_P;
1001 		}
1002 
1003 		topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1004 
1005 		/*
1006 		 * Since we want to keep a linear cluster ID space, we cannot just rely
1007 		 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1008 		 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1009 		 */
1010 #if HAS_CLUSTER
1011 		uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1012 #else
1013 		uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1014 #endif
1015 		assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1016 		cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1017 		    topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1018 
1019 		assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1020 
1021 		ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1022 		if (cluster->num_cpus == 0) {
1023 			assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1024 
1025 			topology_info.num_clusters++;
1026 			topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1027 			topology_info.cluster_types |= (1 << cpu->cluster_type);
1028 
1029 			cluster->cluster_id = cpu->cluster_id;
1030 			cluster->cluster_type = cpu->cluster_type;
1031 			cluster->first_cpu_id = cpu->cpu_id;
1032 			assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1033 			cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1034 
1035 			// Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1036 			// If we wind up with a bunch of these, we might want to create separate per-cluster
1037 			// EDT nodes and have the CPU nodes reference them through a phandle.
1038 			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
1039 			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1040 		}
1041 
1042 #if HAS_CLUSTER
1043 		if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1044 			cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1045 		}
1046 #endif
1047 
1048 		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
1049 		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1050 
1051 		cluster->num_cpus++;
1052 		cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1053 
1054 		if (is_boot_cpu) {
1055 			assert(topology_info.boot_cpu == NULL);
1056 			topology_info.boot_cpu = cpu;
1057 			topology_info.boot_cluster = cluster;
1058 		}
1059 	}
1060 
1061 #if HAS_CLUSTER
1062 	/*
1063 	 * Build the cluster offset array, ensuring that the region reserved
1064 	 * for each physical cluster contains enough entries to be indexed
1065 	 * by the maximum physical CPU ID (AFF0) within the cluster.
1066 	 */
1067 	unsigned int cur_cluster_offset = 0;
1068 	for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1069 		if (cluster_phys_to_logical[i] != -1) {
1070 			cluster_offsets[i] = cur_cluster_offset;
1071 			cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1072 		}
1073 	}
1074 	assert(cur_cluster_offset <= MAX_CPUS);
1075 #else
1076 	/*
1077 	 * For H10, there are really 2 physical clusters, but they are not separated
1078 	 * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
1079 	 * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
1080 	 * treat H10 and earlier devices as though they contain a single cluster.
1081 	 */
1082 	cluster_offsets[0] = 0;
1083 #endif
1084 	assert(topology_info.boot_cpu != NULL);
1085 	ml_read_chip_revision(&topology_info.chip_revision);
1086 
1087 	/*
1088 	 * Set TPIDR_EL0 to indicate the correct cpu number, as we may
1089 	 * not be booting from cpu 0.  Userspace will consume the current
1090 	 * CPU number through this register.  For non-boot cores, this is
1091 	 * done in start.s (start_cpu) using the cpu_number field of the
1092 	 * per-cpu data object.
1093 	 */
1094 	uint64_t cpuid = topology_info.boot_cpu->cpu_id;
1095 
1096 	__builtin_arm_wsr64("TPIDR_EL0", cpuid & MACHDEP_TPIDR_CPUNUM_MASK);
1097 	assert((cpuid & MACHDEP_TPIDR_CPUNUM_MASK) == cpuid);
1098 	__builtin_arm_wsr64("TPIDRRO_EL0", 0);
1099 }
1100 
1101 const ml_topology_info_t *
ml_get_topology_info(void)1102 ml_get_topology_info(void)
1103 {
1104 	return &topology_info;
1105 }
1106 
1107 void
ml_map_cpu_pio(void)1108 ml_map_cpu_pio(void)
1109 {
1110 	unsigned int i;
1111 
1112 	for (i = 0; i < topology_info.num_cpus; i++) {
1113 		ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1114 		if (cpu->cpu_IMPL_pa) {
1115 			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
1116 			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
1117 		}
1118 		if (cpu->cpu_UTTDBG_pa) {
1119 			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1120 		}
1121 	}
1122 
1123 	for (i = 0; i < topology_info.num_clusters; i++) {
1124 		ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1125 		if (cluster->acc_IMPL_pa) {
1126 			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
1127 		}
1128 		if (cluster->cpm_IMPL_pa) {
1129 			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1130 		}
1131 	}
1132 }
1133 
1134 unsigned int
ml_get_cpu_count(void)1135 ml_get_cpu_count(void)
1136 {
1137 	return topology_info.num_cpus;
1138 }
1139 
1140 unsigned int
ml_get_cluster_count(void)1141 ml_get_cluster_count(void)
1142 {
1143 	return topology_info.num_clusters;
1144 }
1145 
1146 int
ml_get_boot_cpu_number(void)1147 ml_get_boot_cpu_number(void)
1148 {
1149 	return topology_info.boot_cpu->cpu_id;
1150 }
1151 
1152 cluster_type_t
ml_get_boot_cluster_type(void)1153 ml_get_boot_cluster_type(void)
1154 {
1155 	return topology_info.boot_cluster->cluster_type;
1156 }
1157 
1158 int
ml_get_cpu_number(uint32_t phys_id)1159 ml_get_cpu_number(uint32_t phys_id)
1160 {
1161 	phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1162 
1163 	for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1164 		if (topology_info.cpus[i].phys_id == phys_id) {
1165 			return i;
1166 		}
1167 	}
1168 
1169 	return -1;
1170 }
1171 
1172 int
ml_get_cluster_number(uint32_t phys_id)1173 ml_get_cluster_number(uint32_t phys_id)
1174 {
1175 	int cpu_id = ml_get_cpu_number(phys_id);
1176 	if (cpu_id < 0) {
1177 		return -1;
1178 	}
1179 
1180 	ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1181 
1182 	return cpu->cluster_id;
1183 }
1184 
1185 unsigned int
ml_get_cpu_number_local(void)1186 ml_get_cpu_number_local(void)
1187 {
1188 	uint64_t mpidr_el1_value = 0;
1189 	unsigned cpu_id;
1190 
1191 	/* We identify the CPU based on the constant bits of MPIDR_EL1. */
1192 	MRS(mpidr_el1_value, "MPIDR_EL1");
1193 	cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
1194 
1195 	assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
1196 
1197 	return cpu_id;
1198 }
1199 
1200 int
ml_get_cluster_number_local()1201 ml_get_cluster_number_local()
1202 {
1203 	uint64_t mpidr_el1_value = 0;
1204 	unsigned cluster_id;
1205 
1206 	/* We identify the cluster based on the constant bits of MPIDR_EL1. */
1207 	MRS(mpidr_el1_value, "MPIDR_EL1");
1208 	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
1209 
1210 	assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
1211 
1212 	return cluster_id;
1213 }
1214 
1215 int
ml_get_max_cpu_number(void)1216 ml_get_max_cpu_number(void)
1217 {
1218 	return topology_info.max_cpu_id;
1219 }
1220 
1221 int
ml_get_max_cluster_number(void)1222 ml_get_max_cluster_number(void)
1223 {
1224 	return topology_info.max_cluster_id;
1225 }
1226 
1227 unsigned int
ml_get_first_cpu_id(unsigned int cluster_id)1228 ml_get_first_cpu_id(unsigned int cluster_id)
1229 {
1230 	return topology_info.clusters[cluster_id].first_cpu_id;
1231 }
1232 
1233 
/*
 * Calls rorgn_stash_range() when KERNEL_INTEGRITY_KTRR or
 * KERNEL_INTEGRITY_CTRR is defined; otherwise does nothing.
 *
 * Declared with an explicit (void) parameter list so that the definition is a
 * full prototype, matching the other routines in this file.
 */
void
ml_lockdown_init(void)
{
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	rorgn_stash_range();
#endif
}
1241 
1242 kern_return_t
ml_lockdown_handler_register(lockdown_handler_t f,void * this)1243 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1244 {
1245 	if (!f) {
1246 		return KERN_FAILURE;
1247 	}
1248 
1249 	assert(lockdown_done);
1250 	f(this); // XXX: f this whole function
1251 
1252 	return KERN_SUCCESS;
1253 }
1254 
1255 kern_return_t
ml_processor_register(ml_processor_info_t * in_processor_info,processor_t * processor_out,ipi_handler_t * ipi_handler_out,perfmon_interrupt_handler_func * pmi_handler_out)1256 ml_processor_register(ml_processor_info_t *in_processor_info,
1257     processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1258     perfmon_interrupt_handler_func *pmi_handler_out)
1259 {
1260 	cpu_data_t *this_cpu_datap;
1261 	processor_set_t pset;
1262 	boolean_t  is_boot_cpu;
1263 	static unsigned int reg_cpu_count = 0;
1264 
1265 	if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1266 		return KERN_FAILURE;
1267 	}
1268 
1269 	if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
1270 		return KERN_FAILURE;
1271 	}
1272 
1273 	if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1274 		is_boot_cpu = FALSE;
1275 		this_cpu_datap = cpu_data_alloc(FALSE);
1276 		cpu_data_init(this_cpu_datap);
1277 	} else {
1278 		this_cpu_datap = &BootCpuData;
1279 		is_boot_cpu = TRUE;
1280 	}
1281 
1282 	assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1283 
1284 	this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1285 
1286 	if (!is_boot_cpu) {
1287 		this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1288 
1289 		if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1290 			goto processor_register_error;
1291 		}
1292 		assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1293 	}
1294 
1295 	this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1296 	this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
1297 	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
1298 	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1299 
1300 	this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1301 	this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1302 
1303 	this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1304 	this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1305 	this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1306 	this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1307 
1308 	this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1309 	this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1310 	this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1311 	this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1312 	this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1313 	this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1314 
1315 #if HAS_CLUSTER
1316 	this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1317 #else /* HAS_CLUSTER */
1318 	this_cpu_datap->cluster_master = is_boot_cpu;
1319 #endif /* HAS_CLUSTER */
1320 	pset = pset_find(in_processor_info->cluster_id, NULL);
1321 	if (pset == NULL) {
1322 #if __AMP__
1323 		pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1324 		pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1325 		assert(pset != PROCESSOR_SET_NULL);
1326 #else /* __AMP__ */
1327 		pset_cluster_type_t pset_cluster_type = PSET_SMP;
1328 		pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1329 		assert(pset != PROCESSOR_SET_NULL);
1330 #endif /* __AMP__ */
1331 	}
1332 	kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
1333 
1334 	processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1335 	if (!is_boot_cpu) {
1336 		processor_init(processor, this_cpu_datap->cpu_number, pset);
1337 
1338 		if (this_cpu_datap->cpu_l2_access_penalty) {
1339 			/*
1340 			 * Cores that have a non-zero L2 access penalty compared
1341 			 * to the boot processor should be de-prioritized by the
1342 			 * scheduler, so that threads use the cores with better L2
1343 			 * preferentially.
1344 			 */
1345 			processor_set_primary(processor, master_processor);
1346 		}
1347 	}
1348 
1349 	*processor_out = processor;
1350 	*ipi_handler_out = cpu_signal_handler;
1351 #if CPMU_AIC_PMI && MONOTONIC
1352 	*pmi_handler_out = mt_cpmu_aic_pmi;
1353 #else
1354 	*pmi_handler_out = NULL;
1355 #endif /* CPMU_AIC_PMI && MONOTONIC */
1356 	if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1357 		*in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1358 	}
1359 
1360 #if KPC
1361 	if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1362 		goto processor_register_error;
1363 	}
1364 #endif /* KPC */
1365 
1366 	if (!is_boot_cpu) {
1367 		random_cpu_init(this_cpu_datap->cpu_number);
1368 		// now let next CPU register itself
1369 		OSIncrementAtomic((SInt32*)&real_ncpus);
1370 	}
1371 
1372 	return KERN_SUCCESS;
1373 
1374 processor_register_error:
1375 #if KPC
1376 	kpc_unregister_cpu(this_cpu_datap);
1377 #endif /* KPC */
1378 	if (!is_boot_cpu) {
1379 		cpu_data_free(this_cpu_datap);
1380 	}
1381 
1382 	return KERN_FAILURE;
1383 }
1384 
1385 void
ml_init_arm_debug_interface(void * in_cpu_datap,vm_offset_t virt_address)1386 ml_init_arm_debug_interface(
1387 	void * in_cpu_datap,
1388 	vm_offset_t virt_address)
1389 {
1390 	((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1391 	do_debugid();
1392 }
1393 
1394 /*
1395  *	Routine:        init_ast_check
1396  *	Function:
1397  */
1398 void
init_ast_check(__unused processor_t processor)1399 init_ast_check(
1400 	__unused processor_t processor)
1401 {
1402 }
1403 
1404 /*
1405  *	Routine:        cause_ast_check
1406  *	Function:
1407  */
1408 void
cause_ast_check(processor_t processor)1409 cause_ast_check(
1410 	processor_t processor)
1411 {
1412 	if (current_processor() != processor) {
1413 		cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1414 		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1415 	}
1416 }
1417 
1418 extern uint32_t cpu_idle_count;
1419 
1420 void
ml_get_power_state(boolean_t * icp,boolean_t * pidlep)1421 ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1422 {
1423 	*icp = ml_at_interrupt_context();
1424 	*pidlep = (cpu_idle_count == real_ncpus);
1425 }
1426 
/*
 *	Routine:        ml_cause_interrupt
 *	Function:	Generate a fake interrupt.  This implementation is a
 *			stub and does nothing.
 */
void
ml_cause_interrupt(void)
{
	/* Intentionally empty. */
}
1436 
1437 /* Map memory map IO space */
1438 vm_offset_t
ml_io_map(vm_offset_t phys_addr,vm_size_t size)1439 ml_io_map(
1440 	vm_offset_t phys_addr,
1441 	vm_size_t size)
1442 {
1443 	return io_map(phys_addr, size, VM_WIMG_IO);
1444 }
1445 
1446 /* Map memory map IO space (with protections specified) */
1447 vm_offset_t
ml_io_map_with_prot(vm_offset_t phys_addr,vm_size_t size,vm_prot_t prot)1448 ml_io_map_with_prot(
1449 	vm_offset_t phys_addr,
1450 	vm_size_t size,
1451 	vm_prot_t prot)
1452 {
1453 	return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot);
1454 }
1455 
1456 vm_offset_t
ml_io_map_wcomb(vm_offset_t phys_addr,vm_size_t size)1457 ml_io_map_wcomb(
1458 	vm_offset_t phys_addr,
1459 	vm_size_t size)
1460 {
1461 	return io_map(phys_addr, size, VM_WIMG_WCOMB);
1462 }
1463 
1464 void
ml_io_unmap(vm_offset_t addr,vm_size_t sz)1465 ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1466 {
1467 	pmap_remove(kernel_pmap, addr, addr + sz);
1468 	kmem_free(kernel_map, addr, sz);
1469 }
1470 
1471 /* boot memory allocation */
1472 vm_offset_t
ml_static_malloc(__unused vm_size_t size)1473 ml_static_malloc(
1474 	__unused vm_size_t size)
1475 {
1476 	return (vm_offset_t) NULL;
1477 }
1478 
1479 vm_map_address_t
ml_map_high_window(vm_offset_t phys_addr,vm_size_t len)1480 ml_map_high_window(
1481 	vm_offset_t     phys_addr,
1482 	vm_size_t       len)
1483 {
1484 	return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1485 }
1486 
1487 vm_offset_t
ml_static_ptovirt(vm_offset_t paddr)1488 ml_static_ptovirt(
1489 	vm_offset_t paddr)
1490 {
1491 	return phystokv(paddr);
1492 }
1493 
1494 vm_offset_t
ml_static_slide(vm_offset_t vaddr)1495 ml_static_slide(
1496 	vm_offset_t vaddr)
1497 {
1498 	vm_offset_t slid_vaddr = vaddr + vm_kernel_slide;
1499 
1500 	if ((slid_vaddr < vm_kernelcache_base) || (slid_vaddr >= vm_kernelcache_top)) {
1501 		/* This is only intended for use on kernelcache addresses. */
1502 		return 0;
1503 	}
1504 
1505 	/*
1506 	 * Because the address is in the kernelcache, we can do a simple
1507 	 * slide calculation.
1508 	 */
1509 	return slid_vaddr;
1510 }
1511 
1512 vm_offset_t
ml_static_unslide(vm_offset_t vaddr)1513 ml_static_unslide(
1514 	vm_offset_t vaddr)
1515 {
1516 	if ((vaddr < vm_kernelcache_base) || (vaddr >= vm_kernelcache_top)) {
1517 		/* This is only intended for use on kernelcache addresses. */
1518 		return 0;
1519 	}
1520 
1521 	return vaddr - vm_kernel_slide;
1522 }
1523 
1524 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1525 
1526 kern_return_t
ml_static_protect(vm_offset_t vaddr,vm_size_t size,vm_prot_t new_prot)1527 ml_static_protect(
1528 	vm_offset_t vaddr, /* kernel virtual address */
1529 	vm_size_t size,
1530 	vm_prot_t new_prot)
1531 {
1532 	pt_entry_t    arm_prot = 0;
1533 	pt_entry_t    arm_block_prot = 0;
1534 	vm_offset_t   vaddr_cur;
1535 	ppnum_t       ppn;
1536 	kern_return_t result = KERN_SUCCESS;
1537 
1538 	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1539 		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) VM_MIN_KERNEL_ADDRESS);
1540 		return KERN_FAILURE;
1541 	}
1542 
1543 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1544 
1545 	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
1546 		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
1547 	}
1548 	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
1549 		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
1550 	}
1551 
1552 	/* Set up the protection bits, and block bits so we can validate block mappings. */
1553 	if (new_prot & VM_PROT_WRITE) {
1554 		arm_prot |= ARM_PTE_AP(AP_RWNA);
1555 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
1556 	} else {
1557 		arm_prot |= ARM_PTE_AP(AP_RONA);
1558 		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
1559 	}
1560 
1561 	arm_prot |= ARM_PTE_NX;
1562 	arm_block_prot |= ARM_TTE_BLOCK_NX;
1563 
1564 	if (!(new_prot & VM_PROT_EXECUTE)) {
1565 		arm_prot |= ARM_PTE_PNX;
1566 		arm_block_prot |= ARM_TTE_BLOCK_PNX;
1567 	}
1568 
1569 	for (vaddr_cur = vaddr;
1570 	    vaddr_cur < trunc_page_64(vaddr + size);
1571 	    vaddr_cur += PAGE_SIZE) {
1572 		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1573 		if (ppn != (vm_offset_t) NULL) {
1574 			tt_entry_t      *tte2;
1575 			pt_entry_t      *pte_p;
1576 			pt_entry_t      ptmp;
1577 
1578 #if XNU_MONITOR
1579 			assert(!pmap_is_monitor(ppn));
1580 			assert(!TEST_PAGE_RATIO_4);
1581 #endif
1582 
1583 			tte2 = arm_kva_to_tte(vaddr_cur);
1584 
1585 			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1586 				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
1587 				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
1588 					/*
1589 					 * We can support ml_static_protect on a block mapping if the mapping already has
1590 					 * the desired protections.  We still want to run checks on a per-page basis.
1591 					 */
1592 					continue;
1593 				}
1594 
1595 				result = KERN_FAILURE;
1596 				break;
1597 			}
1598 
1599 			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
1600 			ptmp = *pte_p;
1601 
1602 			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
1603 				/*
1604 				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
1605 				 * protections do not match the desired protections, then we will fail (as we cannot update
1606 				 * this mapping without updating other mappings as well).
1607 				 */
1608 				result = KERN_FAILURE;
1609 				break;
1610 			}
1611 
1612 			__unreachable_ok_push
1613 			if (TEST_PAGE_RATIO_4) {
1614 				{
1615 					unsigned int    i;
1616 					pt_entry_t      *ptep_iter;
1617 
1618 					ptep_iter = pte_p;
1619 					for (i = 0; i < 4; i++, ptep_iter++) {
1620 						/* Note that there is a hole in the HINT sanity checking here. */
1621 						ptmp = *ptep_iter;
1622 
1623 						/* We only need to update the page tables if the protections do not match. */
1624 						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
1625 							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
1626 							*ptep_iter = ptmp;
1627 						}
1628 					}
1629 				}
1630 			} else {
1631 				ptmp = *pte_p;
1632 				/* We only need to update the page tables if the protections do not match. */
1633 				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
1634 					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
1635 					*pte_p = ptmp;
1636 				}
1637 			}
1638 			__unreachable_ok_pop
1639 		}
1640 	}
1641 
1642 	if (vaddr_cur > vaddr) {
1643 		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
1644 		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
1645 	}
1646 
1647 
1648 	return result;
1649 }
1650 
1651 /*
1652  *	Routine:        ml_static_mfree
1653  *	Function:
1654  */
1655 void
ml_static_mfree(vm_offset_t vaddr,vm_size_t size)1656 ml_static_mfree(
1657 	vm_offset_t vaddr,
1658 	vm_size_t   size)
1659 {
1660 	vm_offset_t vaddr_cur;
1661 	ppnum_t     ppn;
1662 	uint32_t    freed_pages = 0;
1663 	uint32_t    bad_page_cnt = 0;
1664 	uint32_t    freed_kernelcache_pages = 0;
1665 
1666 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
1667 	/* For testing hitting a bad ram page */
1668 	static int count = 0;
1669 	static int bad_at_cnt = -1;
1670 	static bool first = true;
1671 
1672 	if (first) {
1673 		(void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt));
1674 		first = false;
1675 	}
1676 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
1677 
1678 	/* It is acceptable (if bad) to fail to free. */
1679 	if (vaddr < VM_MIN_KERNEL_ADDRESS) {
1680 		return;
1681 	}
1682 
1683 	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
1684 
1685 	for (vaddr_cur = vaddr;
1686 	    vaddr_cur < trunc_page_64(vaddr + size);
1687 	    vaddr_cur += PAGE_SIZE) {
1688 		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1689 		if (ppn != (vm_offset_t) NULL) {
1690 			/*
1691 			 * It is not acceptable to fail to update the protections on a page
1692 			 * we will release to the VM.  We need to either panic or continue.
1693 			 * For now, we'll panic (to help flag if there is memory we can
1694 			 * reclaim).
1695 			 */
1696 			if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
1697 				panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1698 			}
1699 
1700 #if defined(__arm64__)
1701 			bool is_bad = pmap_is_bad_ram(ppn);
1702 #if DEVELOPMENT || DEBUG
1703 			is_bad |= (count++ == bad_at_cnt);
1704 #endif /* DEVELOPMENT || DEBUG */
1705 
1706 			if (is_bad) {
1707 				++bad_page_cnt;
1708 				vm_page_create_retired(ppn);
1709 				continue;
1710 			}
1711 #endif /* defined(__arm64__) */
1712 
1713 			vm_page_create(ppn, (ppn + 1));
1714 			freed_pages++;
1715 			if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) {
1716 				freed_kernelcache_pages++;
1717 			}
1718 		}
1719 	}
1720 	vm_page_lockspin_queues();
1721 	vm_page_wire_count -= freed_pages;
1722 	vm_page_wire_count_initial -= freed_pages;
1723 	vm_page_kernelcache_count -= freed_kernelcache_pages;
1724 	vm_page_unlock_queues();
1725 #if     DEBUG
1726 	kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
1727 #endif
1728 }
1729 
1730 
1731 /* virtual to physical on wired pages */
1732 vm_offset_t
ml_vtophys(vm_offset_t vaddr)1733 ml_vtophys(vm_offset_t vaddr)
1734 {
1735 	return kvtophys(vaddr);
1736 }
1737 
1738 /*
1739  * Routine: ml_nofault_copy
1740  * Function: Perform a physical mode copy if the source and destination have
1741  * valid translations in the kernel pmap. If translations are present, they are
1742  * assumed to be wired; e.g., no attempt is made to guarantee that the
1743  * translations obtained remain valid for the duration of the copy process.
1744  */
1745 vm_size_t
ml_nofault_copy(vm_offset_t virtsrc,vm_offset_t virtdst,vm_size_t size)1746 ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
1747 {
1748 	addr64_t        cur_phys_dst, cur_phys_src;
1749 	vm_size_t       count, nbytes = 0;
1750 
1751 	while (size > 0) {
1752 		if (!(cur_phys_src = kvtophys(virtsrc))) {
1753 			break;
1754 		}
1755 		if (!(cur_phys_dst = kvtophys(virtdst))) {
1756 			break;
1757 		}
1758 		if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
1759 		    !pmap_valid_address(trunc_page_64(cur_phys_src))) {
1760 			break;
1761 		}
1762 		count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
1763 		if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
1764 			count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
1765 		}
1766 		if (count > size) {
1767 			count = size;
1768 		}
1769 
1770 		bcopy_phys(cur_phys_src, cur_phys_dst, count);
1771 
1772 		nbytes += count;
1773 		virtsrc += count;
1774 		virtdst += count;
1775 		size -= count;
1776 	}
1777 
1778 	return nbytes;
1779 }
1780 
1781 /*
1782  *	Routine:        ml_validate_nofault
1783  *	Function: Validate that ths address range has a valid translations
1784  *			in the kernel pmap.  If translations are present, they are
1785  *			assumed to be wired; i.e. no attempt is made to guarantee
1786  *			that the translation persist after the check.
1787  *  Returns: TRUE if the range is mapped and will not cause a fault,
1788  *			FALSE otherwise.
1789  */
1790 
1791 boolean_t
ml_validate_nofault(vm_offset_t virtsrc,vm_size_t size)1792 ml_validate_nofault(
1793 	vm_offset_t virtsrc, vm_size_t size)
1794 {
1795 	addr64_t cur_phys_src;
1796 	uint32_t count;
1797 
1798 	while (size > 0) {
1799 		if (!(cur_phys_src = kvtophys(virtsrc))) {
1800 			return FALSE;
1801 		}
1802 		if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
1803 			return FALSE;
1804 		}
1805 		count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
1806 		if (count > size) {
1807 			count = (uint32_t)size;
1808 		}
1809 
1810 		virtsrc += count;
1811 		size -= count;
1812 	}
1813 
1814 	return TRUE;
1815 }
1816 
1817 void
ml_get_bouncepool_info(vm_offset_t * phys_addr,vm_size_t * size)1818 ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
1819 {
1820 	*phys_addr = 0;
1821 	*size = 0;
1822 }
1823 
1824 void
active_rt_threads(__unused boolean_t active)1825 active_rt_threads(__unused boolean_t active)
1826 {
1827 }
1828 
1829 static void
cpu_qos_cb_default(__unused int urgency,__unused uint64_t qos_param1,__unused uint64_t qos_param2)1830 cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
1831 {
1832 	return;
1833 }
1834 
1835 cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;
1836 
1837 void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)1838 cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
1839 {
1840 	if (cpu_qos_cb != NULL) {
1841 		cpu_qos_update = cpu_qos_cb;
1842 	} else {
1843 		cpu_qos_update = cpu_qos_cb_default;
1844 	}
1845 }
1846 
1847 void
thread_tell_urgency(thread_urgency_t urgency,uint64_t rt_period,uint64_t rt_deadline,uint64_t sched_latency __unused,__unused thread_t nthread)1848 thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
1849 {
1850 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
1851 
1852 	cpu_qos_update((int)urgency, rt_period, rt_deadline);
1853 
1854 	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
1855 }
1856 
1857 void
machine_run_count(__unused uint32_t count)1858 machine_run_count(__unused uint32_t count)
1859 {
1860 }
1861 
1862 processor_t
machine_choose_processor(__unused processor_set_t pset,processor_t processor)1863 machine_choose_processor(__unused processor_set_t pset, processor_t processor)
1864 {
1865 	return processor;
1866 }
1867 
#if KASAN
vm_offset_t ml_stack_base(void);
vm_size_t ml_stack_size(void);

/*
 * Returns the base of the stack the caller is running on.  The address of a
 * local variable is compared against this CPU's interrupt stack: strictly
 * inside it, the interrupt stack's lowest address is returned; otherwise the
 * current thread's kernel_stack is returned.
 */
vm_offset_t
ml_stack_base(void)
{
	uintptr_t local = (uintptr_t) &local;
	const vm_offset_t intstack_top = getCpuDatap()->intstack_top;
	const vm_offset_t intstack_bottom = intstack_top - INTSTACK_SIZE;

	if ((local > intstack_bottom) && (local < intstack_top)) {
		return intstack_bottom;
	}

	return current_thread()->kernel_stack;
}

/*
 * Returns the size of the stack the caller is running on, using the same
 * test as ml_stack_base(): INTSTACK_SIZE on the interrupt stack,
 * kernel_stack_size otherwise.
 */
vm_size_t
ml_stack_size(void)
{
	uintptr_t local = (uintptr_t) &local;
	const vm_offset_t intstack_top = getCpuDatap()->intstack_top;
	const vm_offset_t intstack_bottom = intstack_top - INTSTACK_SIZE;

	if ((local > intstack_bottom) && (local < intstack_top)) {
		return INTSTACK_SIZE;
	}

	return kernel_stack_size;
}
#endif
1899 
#ifdef CONFIG_KCOV

/*
 * Returns the address of the cpu_kcov_data member inside the current CPU's
 * per-CPU data.
 */
kcov_cpu_data_t *
current_kcov_data(void)
{
	return &(current_cpu_datap()->cpu_kcov_data);
}

/*
 * Returns the address of the cpu_kcov_data member inside the per-CPU data of
 * the CPU numbered cpuid.
 */
kcov_cpu_data_t *
cpu_kcov_data(int cpuid)
{
	return &(cpu_datap(cpuid)->cpu_kcov_data);
}

#endif /* CONFIG_KCOV */
1915 
1916 boolean_t
machine_timeout_suspended(void)1917 machine_timeout_suspended(void)
1918 {
1919 	return FALSE;
1920 }
1921 
1922 kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)1923 ml_interrupt_prewarm(__unused uint64_t deadline)
1924 {
1925 	return KERN_FAILURE;
1926 }
1927 
1928 /*
1929  * Assumes fiq, irq disabled.
1930  */
1931 void
ml_set_decrementer(uint32_t dec_value)1932 ml_set_decrementer(uint32_t dec_value)
1933 {
1934 	cpu_data_t      *cdp = getCpuDatap();
1935 
1936 	assert(ml_get_interrupts_enabled() == FALSE);
1937 	cdp->cpu_decrementer = dec_value;
1938 
1939 	if (cdp->cpu_set_decrementer_func) {
1940 		cdp->cpu_set_decrementer_func(dec_value);
1941 	} else {
1942 		__builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
1943 	}
1944 }
1945 
1946 
1947 uint64_t
ml_get_hwclock()1948 ml_get_hwclock()
1949 {
1950 	uint64_t timebase;
1951 
1952 #if   __ARM_ARCH_8_6__
1953 	timebase = __builtin_arm_rsr64("CNTVCTSS_EL0");
1954 #else
1955 	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
1956 	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
1957 	// to other instructions executed on the same processor."
1958 	__builtin_arm_isb(ISB_SY);
1959 	timebase = __builtin_arm_rsr64("CNTVCT_EL0");
1960 #endif
1961 
1962 	return timebase;
1963 }
1964 
1965 uint64_t
ml_get_timebase()1966 ml_get_timebase()
1967 {
1968 	uint64_t clock, timebase;
1969 
1970 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633
1971 	do {
1972 		timebase = getCpuDatap()->cpu_base_timebase;
1973 		os_compiler_barrier();
1974 		clock = ml_get_hwclock();
1975 		os_compiler_barrier();
1976 	} while (getCpuDatap()->cpu_base_timebase != timebase);
1977 
1978 	return clock + timebase;
1979 }
1980 
1981 /*
1982  * Get the speculative timebase without an ISB.
1983  */
1984 uint64_t
ml_get_speculative_timebase()1985 ml_get_speculative_timebase()
1986 {
1987 	uint64_t clock, timebase;
1988 
1989 	//the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482
1990 	do {
1991 		timebase = getCpuDatap()->cpu_base_timebase;
1992 		os_compiler_barrier();
1993 		clock = __builtin_arm_rsr64("CNTVCT_EL0");
1994 
1995 		os_compiler_barrier();
1996 	} while (getCpuDatap()->cpu_base_timebase != timebase);
1997 
1998 	return clock + timebase;
1999 }
2000 
2001 uint64_t
ml_get_timebase_entropy(void)2002 ml_get_timebase_entropy(void)
2003 {
2004 	return ml_get_speculative_timebase();
2005 }
2006 
2007 uint32_t
ml_get_decrementer()2008 ml_get_decrementer()
2009 {
2010 	cpu_data_t *cdp = getCpuDatap();
2011 	uint32_t dec;
2012 
2013 	assert(ml_get_interrupts_enabled() == FALSE);
2014 
2015 	if (cdp->cpu_get_decrementer_func) {
2016 		dec = cdp->cpu_get_decrementer_func();
2017 	} else {
2018 		uint64_t wide_val;
2019 
2020 		wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2021 		dec = (uint32_t)wide_val;
2022 		assert(wide_val == (uint64_t)dec);
2023 	}
2024 
2025 	return dec;
2026 }
2027 
2028 boolean_t
ml_get_timer_pending()2029 ml_get_timer_pending()
2030 {
2031 	uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2032 	return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
2033 }
2034 
2035 __attribute__((noreturn))
2036 void
platform_syscall(arm_saved_state_t * state)2037 platform_syscall(arm_saved_state_t *state)
2038 {
2039 	uint32_t code;
2040 
2041 #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */
2042 
2043 	code = (uint32_t)get_saved_state_reg(state, 3);
2044 
2045 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
2046 	    get_saved_state_reg(state, 0),
2047 	    get_saved_state_reg(state, 1),
2048 	    get_saved_state_reg(state, 2));
2049 
2050 	switch (code) {
2051 	case 2:
2052 		/* set cthread */
2053 		platform_syscall_kprintf("set cthread self.\n");
2054 		thread_set_cthread_self(get_saved_state_reg(state, 0));
2055 		break;
2056 	case 3:
2057 		/* get cthread */
2058 		platform_syscall_kprintf("get cthread self.\n");
2059 		set_saved_state_reg(state, 0, thread_get_cthread_self());
2060 		break;
2061 	case 0: /* I-Cache flush (removed) */
2062 	case 1: /* D-Cache flush (removed) */
2063 	default:
2064 		platform_syscall_kprintf("unknown: %d\n", code);
2065 		break;
2066 	}
2067 
2068 	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
2069 	    get_saved_state_reg(state, 0));
2070 
2071 	thread_exception_return();
2072 }
2073 
2074 static void
_enable_timebase_event_stream(uint32_t bit_index)2075 _enable_timebase_event_stream(uint32_t bit_index)
2076 {
2077 	uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */
2078 
2079 	if (bit_index >= 64) {
2080 		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
2081 	}
2082 
2083 	__asm__ volatile ("mrs	%0, CNTKCTL_EL1" : "=r"(cntkctl));
2084 
2085 	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
2086 	cntkctl |= CNTKCTL_EL1_EVNTEN;
2087 	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */
2088 
2089 	/*
2090 	 * If the SOC supports it (and it isn't broken), enable
2091 	 * EL0 access to the timebase registers.
2092 	 */
2093 	if (user_timebase_type() != USER_TIMEBASE_NONE) {
2094 		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
2095 	}
2096 
2097 	__builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
2098 }
2099 
2100 /*
2101  * Turn timer on, unmask that interrupt.
2102  */
2103 static void
_enable_virtual_timer(void)2104 _enable_virtual_timer(void)
2105 {
2106 	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
2107 
2108 	__builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
2109 	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
2110 	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
2111 }
2112 
2113 void
fiq_context_init(boolean_t enable_fiq __unused)2114 fiq_context_init(boolean_t enable_fiq __unused)
2115 {
2116 	/* Interrupts still disabled. */
2117 	assert(ml_get_interrupts_enabled() == FALSE);
2118 	_enable_virtual_timer();
2119 }
2120 
2121 void
wfe_timeout_init(void)2122 wfe_timeout_init(void)
2123 {
2124 	_enable_timebase_event_stream(arm64_eventi);
2125 }
2126 
2127 /**
2128  * Configures, but does not enable, the WFE event stream. The event stream
2129  * generates an event at a set interval to act as a timeout for WFEs.
2130  *
2131  * This function sets the static global variable arm64_eventi to be the proper
2132  * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
2133  * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
2134  * is used by wfe_timeout_init to actually poke the registers and enable the
2135  * event stream.
2136  *
2137  * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
2138  * is the trigger for the system to generate an event. The trigger can occur on
2139  * either the rising or falling edge of the bit depending on the value of
2140  * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
2141  * falling edge (1->0) transition to generate events.
2142  */
2143 void
wfe_timeout_configure(void)2144 wfe_timeout_configure(void)
2145 {
2146 	/* Could fill in our own ops here, if we needed them */
2147 	uint64_t        ticks_per_sec, ticks_per_event, events_per_sec = 0;
2148 	uint32_t        bit_index;
2149 
2150 	if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2151 		if (events_per_sec <= 0) {
2152 			events_per_sec = 1;
2153 		} else if (events_per_sec > USEC_PER_SEC) {
2154 			events_per_sec = USEC_PER_SEC;
2155 		}
2156 	} else {
2157 		events_per_sec = USEC_PER_SEC;
2158 	}
2159 	ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
2160 	ticks_per_event = ticks_per_sec / events_per_sec;
2161 
2162 	/* Bit index of next power of two greater than ticks_per_event */
2163 	bit_index = flsll(ticks_per_event) - 1;
2164 	/* Round up to next power of two if ticks_per_event is initially power of two */
2165 	if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
2166 		bit_index++;
2167 	}
2168 
2169 	/*
2170 	 * The timer can only trigger on rising or falling edge, not both; we don't
2171 	 * care which we trigger on, but we do need to adjust which bit we are
2172 	 * interested in to account for this.
2173 	 *
2174 	 * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the
2175 	 * falling edge of the given bit. Therefore, we must decrement the bit index
2176 	 * by one as when the bit before the one we care about makes a 1 -> 0
2177 	 * transition, the bit we care about makes a 0 -> 1 transition.
2178 	 *
2179 	 * For example if we want an event generated every 8 ticks (if we calculated
2180 	 * a bit_index of 3), we would want the event to be generated whenever the
2181 	 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
2182 	 * see that the bit at index 2 makes a falling transition in this scenario,
2183 	 * so we would want EVENTI to be 2 instead of 3.
2184 	 */
2185 	if (bit_index != 0) {
2186 		bit_index--;
2187 	}
2188 
2189 	arm64_eventi = bit_index;
2190 }
2191 
2192 boolean_t
ml_delay_should_spin(uint64_t interval)2193 ml_delay_should_spin(uint64_t interval)
2194 {
2195 	cpu_data_t     *cdp = getCpuDatap();
2196 
2197 	if (cdp->cpu_idle_latency) {
2198 		return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
2199 	} else {
2200 		/*
2201 		 * Early boot, latency is unknown. Err on the side of blocking,
2202 		 * which should always be safe, even if slow
2203 		 */
2204 		return FALSE;
2205 	}
2206 }
2207 
2208 boolean_t
ml_thread_is64bit(thread_t thread)2209 ml_thread_is64bit(thread_t thread)
2210 {
2211 	return thread_is_64bit_addr(thread);
2212 }
2213 
/*
 * On DEVELOPMENT/DEBUG builds, delay for yield_delay_us when it is
 * non-zero.  Does nothing on other builds.
 */
void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us != 0) {
		delay(yield_delay_us);
	}
#endif
}
2223 
/*
 * No-op in this implementation.
 */
void
ml_timer_evaluate(void)
{
	/* Nothing to do. */
}
2228 
2229 boolean_t
ml_timer_forced_evaluation(void)2230 ml_timer_forced_evaluation(void)
2231 {
2232 	return FALSE;
2233 }
2234 
2235 uint64_t
ml_energy_stat(thread_t t)2236 ml_energy_stat(thread_t t)
2237 {
2238 	return t->machine.energy_estimate_nj;
2239 }
2240 
2241 
2242 void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)2243 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
2244 {
2245 	/*
2246 	 * For now: update the resource coalition stats of the
2247 	 * current thread's coalition
2248 	 */
2249 	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2250 }
2251 
2252 uint64_t
ml_gpu_stat(__unused thread_t t)2253 ml_gpu_stat(__unused thread_t t)
2254 {
2255 	return 0;
2256 }
2257 
2258 #if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT
2259 
2260 static void
timer_state_event(boolean_t switch_to_kernel)2261 timer_state_event(boolean_t switch_to_kernel)
2262 {
2263 	thread_t thread = current_thread();
2264 	if (!thread->precise_user_kernel_time) {
2265 		return;
2266 	}
2267 
2268 	processor_t pd = current_processor();
2269 	uint64_t now = ml_get_speculative_timebase();
2270 
2271 	timer_stop(pd->current_state, now);
2272 	pd->current_state = (switch_to_kernel) ? &pd->system_state : &pd->user_state;
2273 	timer_start(pd->current_state, now);
2274 
2275 	timer_stop(pd->thread_timer, now);
2276 	pd->thread_timer = (switch_to_kernel) ? &thread->system_timer : &thread->user_timer;
2277 	timer_start(pd->thread_timer, now);
2278 }
2279 
2280 void
timer_state_event_user_to_kernel(void)2281 timer_state_event_user_to_kernel(void)
2282 {
2283 	timer_state_event(TRUE);
2284 }
2285 
2286 void
timer_state_event_kernel_to_user(void)2287 timer_state_event_kernel_to_user(void)
2288 {
2289 	timer_state_event(FALSE);
2290 }
2291 #endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT */
2292 
2293 thread_t
current_thread(void)2294 current_thread(void)
2295 {
2296 	return current_thread_fast();
2297 }
2298 
2299 typedef struct{
2300 	ex_cb_t         cb;
2301 	void            *refcon;
2302 }
2303 ex_cb_info_t;
2304 
2305 ex_cb_info_t ex_cb_info[EXCB_CLASS_MAX];
2306 
2307 /*
2308  * Callback registration
2309  * Currently we support only one registered callback per class but
2310  * it should be possible to support more callbacks
2311  */
2312 kern_return_t
ex_cb_register(ex_cb_class_t cb_class,ex_cb_t cb,void * refcon)2313 ex_cb_register(
2314 	ex_cb_class_t   cb_class,
2315 	ex_cb_t                 cb,
2316 	void                    *refcon)
2317 {
2318 	ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2319 
2320 	if ((NULL == cb) || (cb_class >= EXCB_CLASS_MAX)) {
2321 		return KERN_INVALID_VALUE;
2322 	}
2323 
2324 	if (NULL == pInfo->cb) {
2325 		pInfo->cb = cb;
2326 		pInfo->refcon = refcon;
2327 		return KERN_SUCCESS;
2328 	}
2329 	return KERN_FAILURE;
2330 }
2331 
2332 /*
2333  * Called internally by platform kernel to invoke the registered callback for class
2334  */
2335 ex_cb_action_t
ex_cb_invoke(ex_cb_class_t cb_class,vm_offset_t far)2336 ex_cb_invoke(
2337 	ex_cb_class_t   cb_class,
2338 	vm_offset_t             far)
2339 {
2340 	ex_cb_info_t *pInfo = &ex_cb_info[cb_class];
2341 	ex_cb_state_t state = {far};
2342 
2343 	if (cb_class >= EXCB_CLASS_MAX) {
2344 		panic("Invalid exception callback class 0x%x", cb_class);
2345 	}
2346 
2347 	if (pInfo->cb) {
2348 		return pInfo->cb(cb_class, pInfo->refcon, &state);
2349 	}
2350 	return EXCB_ACTION_NONE;
2351 }
2352 
2353 #if defined(HAS_APPLE_PAC)
2354 void
ml_task_set_disable_user_jop(task_t task,uint8_t disable_user_jop)2355 ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
2356 {
2357 	assert(task);
2358 	task->disable_user_jop = disable_user_jop;
2359 }
2360 
2361 void
ml_thread_set_disable_user_jop(thread_t thread,uint8_t disable_user_jop)2362 ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
2363 {
2364 	assert(thread);
2365 	thread->machine.disable_user_jop = disable_user_jop;
2366 }
2367 
2368 void
ml_task_set_rop_pid(task_t task,task_t parent_task,boolean_t inherit)2369 ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
2370 {
2371 	if (inherit) {
2372 		task->rop_pid = parent_task->rop_pid;
2373 	} else {
2374 		task->rop_pid = early_random();
2375 	}
2376 }
2377 
2378 /**
2379  * jop_pid may be inherited from the parent task or generated inside the shared
2380  * region.  Unfortunately these two parameters are available at very different
2381  * times during task creation, so we need to split this into two steps.
2382  */
2383 void
ml_task_set_jop_pid(task_t task,task_t parent_task,boolean_t inherit)2384 ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
2385 {
2386 	if (inherit) {
2387 		task->jop_pid = parent_task->jop_pid;
2388 	} else {
2389 		task->jop_pid = ml_default_jop_pid();
2390 	}
2391 }
2392 
2393 void
ml_task_set_jop_pid_from_shared_region(task_t task)2394 ml_task_set_jop_pid_from_shared_region(task_t task)
2395 {
2396 	vm_shared_region_t sr = vm_shared_region_get(task);
2397 	/*
2398 	 * If there's no shared region, we can assign the key arbitrarily.  This
2399 	 * typically happens when Mach-O image activation failed part of the way
2400 	 * through, and this task is in the middle of dying with SIGKILL anyway.
2401 	 */
2402 	if (__improbable(!sr)) {
2403 		task->jop_pid = early_random();
2404 		return;
2405 	}
2406 	vm_shared_region_deallocate(sr);
2407 
2408 	/*
2409 	 * Similarly we have to worry about jetsam having killed the task and
2410 	 * already cleared the shared_region_id.
2411 	 */
2412 	task_lock(task);
2413 	if (task->shared_region_id != NULL) {
2414 		task->jop_pid = shared_region_find_key(task->shared_region_id);
2415 	} else {
2416 		task->jop_pid = early_random();
2417 	}
2418 	task_unlock(task);
2419 }
2420 
2421 void
ml_thread_set_jop_pid(thread_t thread,task_t task)2422 ml_thread_set_jop_pid(thread_t thread, task_t task)
2423 {
2424 	thread->machine.jop_pid = task->jop_pid;
2425 }
2426 #endif /* defined(HAS_APPLE_PAC) */
2427 
2428 #if defined(HAS_APPLE_PAC)
2429 #ifdef __ARM_ARCH_8_6__
2430 /**
2431  * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
2432  */
2433 static void *
ml_poison_ptr(void * ptr,ptrauth_key key)2434 ml_poison_ptr(void *ptr, ptrauth_key key)
2435 {
2436 	bool b_key = key & (1ULL << 0);
2437 	uint64_t error_code;
2438 	if (b_key) {
2439 		error_code = 2;
2440 	} else {
2441 		error_code = 1;
2442 	}
2443 
2444 	bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
2445 	bool data_key = key & (1ULL << 1);
2446 	/* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
2447 	bool tbi = data_key && !kernel_pointer;
2448 	unsigned int poison_shift;
2449 	if (tbi) {
2450 		poison_shift = 53;
2451 	} else {
2452 		poison_shift = 61;
2453 	}
2454 
2455 	uintptr_t poisoned = (uintptr_t)ptr;
2456 	poisoned &= ~(3ULL << poison_shift);
2457 	poisoned |= error_code << poison_shift;
2458 	return (void *)poisoned;
2459 }
2460 
/*
 * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
 * compiler to assume this operation has side-effects and cannot be reordered.
 *
 * __suffix is pasted onto "pac" to form the instruction mnemonic; the
 * expression evaluates to the signed pointer.
 */
#define ptrauth_sign_volatile(__value, __suffix, __data)                \
	({                                                              \
	        void *__signed_ptr = __value;                           \
	        asm volatile (                                          \
	                "pac" #__suffix "	%[value], %[data]"          \
	                : [value] "+r"(__signed_ptr)                    \
	                : [data] "r"(__data)                            \
	        );                                                      \
	        __signed_ptr;                                           \
	})
2475 
/*
 * Authenticate _ptr in place without trapping: strip the signature, re-sign
 * the stripped pointer, and compare against the original.  On a match _ptr
 * becomes the stripped pointer; otherwise it becomes the poisoned pointer
 * produced by ml_poison_ptr().
 */
#define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier)                           \
	do {                                                                                    \
	        void *_bare_ptr = ptrauth_strip(_ptr, _key);                                    \
	        void *_resigned_ptr = ptrauth_sign_volatile(_bare_ptr, _suffix, _modifier);     \
	        if (__probable(_ptr == _resigned_ptr)) {                                        \
	                _ptr = _bare_ptr;                                                       \
	        } else {                                                                        \
	                _ptr = ml_poison_ptr(_bare_ptr, _key);                                  \
	        }                                                                               \
	} while (0)
2486 
/* Software emulation: derive the ptrauth key constant from the key suffix. */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier)                \
	ml_auth_ptr_unchecked_for_key(_ptr, _suffix,                    \
	    ptrauth_key_as ## _suffix, _modifier)
2489 #else
/*
 * Authenticate _ptr in place with the "aut<suffix>" instruction.
 *
 * The expansion deliberately carries no trailing semicolon: the call site
 * supplies it, so the macro expands to exactly one statement, like the
 * software-emulation variant of this macro.
 */
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier))
2492 #endif /* __ARM_ARCH_8_6__ */
2493 
2494 /**
2495  * Authenticates a signed pointer without trapping on failure.
2496  *
2497  * @warning This function must be called with interrupts disabled.
2498  *
2499  * @warning Pointer authentication failure should normally be treated as a fatal
2500  * error.  This function is intended for a handful of callers that cannot panic
2501  * on failure, and that understand the risks in handling a poisoned return
2502  * value.  Other code should generally use the trapping variant
2503  * ptrauth_auth_data() instead.
2504  *
2505  * @param ptr the pointer to authenticate
2506  * @param key which key to use for authentication
2507  * @param modifier a modifier to mix into the key
2508  * @return an authenticated version of ptr, possibly with poison bits set
2509  */
2510 void *
ml_auth_ptr_unchecked(void * ptr,ptrauth_key key,uint64_t modifier)2511 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
2512 {
2513 	switch (key & 0x3) {
2514 	case ptrauth_key_asia:
2515 		_ml_auth_ptr_unchecked(ptr, ia, modifier);
2516 		break;
2517 	case ptrauth_key_asib:
2518 		_ml_auth_ptr_unchecked(ptr, ib, modifier);
2519 		break;
2520 	case ptrauth_key_asda:
2521 		_ml_auth_ptr_unchecked(ptr, da, modifier);
2522 		break;
2523 	case ptrauth_key_asdb:
2524 		_ml_auth_ptr_unchecked(ptr, db, modifier);
2525 		break;
2526 	}
2527 
2528 	return ptr;
2529 }
2530 #endif /* defined(HAS_APPLE_PAC) */
2531 
2532 #ifdef CONFIG_XNUPOST
2533 void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler,uintptr_t expected_fault_addr)2534 ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
2535 {
2536 	thread_t thread = current_thread();
2537 	thread->machine.expected_fault_handler = expected_fault_handler;
2538 	thread->machine.expected_fault_addr = expected_fault_addr;
2539 }
2540 
2541 void
ml_expect_fault_end(void)2542 ml_expect_fault_end(void)
2543 {
2544 	thread_t thread = current_thread();
2545 	thread->machine.expected_fault_handler = NULL;
2546 	thread->machine.expected_fault_addr = 0;
2547 }
2548 #endif /* CONFIG_XNUPOST */
2549 
/*
 * With HIBERNATION configured: when waking from hibernate, rebuild the VM
 * structures.  Otherwise a no-op.
 */
void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (gIOHibernateState == kIOHibernateStateWakingFromHibernate) {
		hibernate_rebuild_vm_structs();
	}
#endif /* HIBERNATION */
}
2560 
/*
 * With HIBERNATION configured: when waking from hibernate, run
 * hibernate_machine_init(), call hibernate_vm_lock_end() and clear the
 * current CPU's cpu_hibernate flag.  Otherwise a no-op.
 */
void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (gIOHibernateState != kIOHibernateStateWakingFromHibernate) {
		return;
	}

	hibernate_machine_init();
	hibernate_vm_lock_end();
	current_cpu_datap()->cpu_hibernate = 0;
#endif /* HIBERNATION */
}
2572 
2573 /**
2574  * Return back a machine-dependent array of address space regions that should be
2575  * reserved by the VM (pre-mapped in the address space). This will prevent user
2576  * processes from allocating or deallocating from within these regions.
2577  *
2578  * @param vm_is64bit True if the process has a 64-bit address space.
2579  * @param regions An out parameter representing an array of regions to reserve.
2580  *
2581  * @return The number of reserved regions returned through `regions`.
2582  */
2583 size_t
ml_get_vm_reserved_regions(bool vm_is64bit,struct vm_reserved_region ** regions)2584 ml_get_vm_reserved_regions(bool vm_is64bit, struct vm_reserved_region **regions)
2585 {
2586 	assert(regions != NULL);
2587 
2588 	/**
2589 	 * Reserved regions only apply to 64-bit address spaces. This is because
2590 	 * we only expect to grow the maximum user VA address on 64-bit address spaces
2591 	 * (we've essentially already reached the max for 32-bit spaces). The reserved
2592 	 * regions should safely fall outside of the max user VA for 32-bit processes.
2593 	 */
2594 	if (vm_is64bit) {
2595 		*regions = vm_reserved_regions;
2596 		return ARRAY_COUNT(vm_reserved_regions);
2597 	} else {
2598 		/* Don't reserve any VA regions on arm64_32 processes. */
2599 		*regions = NULL;
2600 		return 0;
2601 	}
2602 }
2603 /* These WFE recommendations are expected to be updated on a relatively
2604  * infrequent cadence, possibly from a different cluster, hence
2605  * false cacheline sharing isn't expected to be material
2606  */
2607 static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
2608 
2609 uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id,uint64_t wfe_timeout_abstime_interval,__unused uint64_t wfe_hint_flags)2610 ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
2611 {
2612 	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
2613 	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
2614 	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
2615 	return 0; /* Success */
2616 }
2617 
#if DEVELOPMENT || DEBUG
/* Debug overrides consulted by ml_cluster_wfe_timeout(); 0 disables each one. */
int wfe_rec_max = 0;                    /* non-zero: raise the result to the largest recommendation of any cluster */
int wfe_rec_none = 0;                   /* non-zero: force the result to 0 */
uint64_t wfe_rec_override_mat = 0;      /* non-zero: replace the result with this value */
uint64_t wfe_rec_clamp = 0;             /* non-zero: upper bound applied to the cluster's own recommendation */
#endif
2624 
2625 uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)2626 ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
2627 {
2628 	/* This and its consumer does not synchronize vis-a-vis updates
2629 	 * of the recommendation; races are acceptable.
2630 	 */
2631 	uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
2632 #if DEVELOPMENT || DEBUG
2633 	if (wfe_rec_clamp) {
2634 		wfet = MIN(wfe_rec_clamp, wfet);
2635 	}
2636 
2637 	if (wfe_rec_max) {
2638 		for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
2639 			if (arm64_cluster_wfe_recs[i] > wfet) {
2640 				wfet = arm64_cluster_wfe_recs[i];
2641 			}
2642 		}
2643 	}
2644 
2645 	if (wfe_rec_none) {
2646 		wfet = 0;
2647 	}
2648 
2649 	if (wfe_rec_override_mat) {
2650 		wfet = wfe_rec_override_mat;
2651 	}
2652 #endif
2653 	return wfet;
2654 }
2655