xref: /xnu-10002.81.5/osfmk/arm/pmap/pmap.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Operations used to manage native page tables.  Accessed indirectly (via
 * pmap_get_pt_ops()) so alternate page-table formats can supply their own
 * implementations of these hooks.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                                    /* allocate an ASID for a pmap */
	.free_id = free_asid,                                      /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async, /* async per-VA-range TLB flush */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,          /* async whole-ASID TLB flush */
	.wimg_to_pte = wimg_to_pte,                                /* WIMG cache attribute -> PTE bits */
};
140 
/*
 * Per-level translation-table geometry for the 16KB translation granule.
 * Indexed by page-table level (L0..L3); L3 entries are leaf PTEs and thus
 * use the PTE valid/type encodings rather than the TTE ones.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, so PTE valid/type masks apply. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level translation-table geometry for the 4KB translation granule.
 * Mirrors pmap_table_level_info_16k, substituting 4K level constants.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, so PTE valid/type masks apply. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/* Page-table attributes for pmaps using the 4KB translation granule. */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root level follows from T0SZ: each 4K table level resolves 9 VA bits. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never PTE bit templates. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
248 
/* Page-table attributes for pmaps using the 16KB translation granule. */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never PTE bit templates. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
268 
269 #if __ARM_16K_PG__
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
271 #else /* !__ARM_16K_PG__ */
272 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
273 #endif /* !__ARM_16K_PG__ */
274 
275 
276 #if MACH_ASSERT
277 int vm_footprint_suspend_allowed = 1;
278 
279 extern int pmap_ledgers_panic;
280 extern int pmap_ledgers_panic_leeway;
281 
282 #endif /* MACH_ASSERT */
283 
284 #if DEVELOPMENT || DEBUG
285 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
286 	(current_thread()->pmap_footprint_suspended)
287 #else /* DEVELOPMENT || DEBUG */
288 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
289 #endif /* DEVELOPMENT || DEBUG */
290 
291 
292 /*
293  * Represents a tlb range that will be flushed before exiting
294  * the ppl.
295  * Used by phys_attribute_clear_range to defer flushing pages in
296  * this range until the end of the operation.
297  */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;               /* pmap whose mappings lie within the range */
	vm_map_address_t ptfr_start;    /* start VA of the range */
	vm_map_address_t ptfr_end;      /* end VA of the range */
	bool ptfr_flush_needed;         /* true if the deferred TLB flush must actually be issued */
} pmap_tlb_flush_range_t;
304 
305 #if XNU_MONITOR
306 /*
307  * PPL External References.
308  */
309 extern vm_offset_t   segPPLDATAB;
310 extern unsigned long segSizePPLDATA;
311 extern vm_offset_t   segPPLTEXTB;
312 extern unsigned long segSizePPLTEXT;
313 extern vm_offset_t   segPPLDATACONSTB;
314 extern unsigned long segSizePPLDATACONST;
315 
316 
317 /*
318  * PPL Global Variables
319  */
320 
321 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
322 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
323 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
324 #else
325 const boolean_t pmap_ppl_disable = FALSE;
326 #endif
327 
328 /*
329  * Indicates if the PPL has started applying APRR.
330  * This variable is accessed from various assembly trampolines, so be sure to change
331  * those if you change the size or layout of this variable.
332  */
333 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
334 
335 extern void *pmap_stacks_start;
336 extern void *pmap_stacks_end;
337 
#endif /* XNU_MONITOR */
339 
340 
341 
342 /* Virtual memory region for early allocation */
343 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
344 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
345 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
346 
347 extern uint8_t bootstrap_pagetables[];
348 
349 extern unsigned int not_in_kdp;
350 
351 extern vm_offset_t first_avail;
352 
353 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
354 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
355 extern vm_offset_t     static_memory_end;
356 
357 extern const vm_map_address_t physmap_base;
358 extern const vm_map_address_t physmap_end;
359 
360 extern int maxproc, hard_maxproc;
361 
362 /* The number of address bits one TTBR can cover. */
363 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
364 
365 /*
366  * The bounds on our TTBRs.  These are for sanity checking that
367  * an address is accessible by a TTBR before we attempt to map it.
368  */
369 
370 /* The level of the root of a page table. */
371 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
372 
373 /* The number of entries in the root TT of a page table. */
374 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
375 
376 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
377 const pmap_t    kernel_pmap = &kernel_pmap_store;
378 
379 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
380 
381 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
382 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
383 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
384 
385 typedef struct tt_free_entry {
386 	struct tt_free_entry    *next;
387 } tt_free_entry_t;
388 
389 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
390 
391 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
392 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
393 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
394 #define FREE_PAGE_SIZE_TT_MAX   4
395 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
396 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
397 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
398 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
399 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
400 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
401 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
402 
403 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
404 
405 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
406 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
408 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
409 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
410 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
411 
412 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
413 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
414 
415 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
416 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
417 
418 /* Lock group used for all pmap object locks. */
419 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
420 
421 #if DEVELOPMENT || DEBUG
422 int nx_enabled = 1;                                     /* enable no-execute protection */
423 int allow_data_exec  = 0;                               /* No apps may execute data */
424 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
425 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
426 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
427 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
428 #else /* DEVELOPMENT || DEBUG */
429 const int nx_enabled = 1;                                       /* enable no-execute protection */
430 const int allow_data_exec  = 0;                         /* No apps may execute data */
431 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
432 #endif /* DEVELOPMENT || DEBUG */
433 
434 /**
435  * This variable is set true during hibernation entry to protect pmap data structures
436  * during image copying, and reset false on hibernation exit.
437  */
438 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
439 
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger validation is only performed on MACH_ASSERT kernels; no-op otherwise. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
448 
449 /**
450  * This helper function ensures that potentially-long-running batched PPL operations are
451  * called in preemptible context before entering the PPL, so that the PPL call may
452  * periodically exit to allow pending urgent ASTs to be taken.
453  */
454 static inline void
pmap_verify_preemptible(void)455 pmap_verify_preemptible(void)
456 {
457 	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
458 }
459 
460 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
461 
462 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
463 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
464 
465 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
466 
467 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
468 #if defined(__arm64__)
469 /* end of shared region + 512MB for various purposes */
470 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
471 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
472     "Minimum address space size outside allowable range");
473 
474 // Max offset is 15.375GB for devices with "large" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
476 // Max offset is 11.375GB for devices with "small" memory config
477 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
478 
479 
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
481     "Large device address space size outside allowable range");
482 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
483     "Small device address space size outside allowable range");
484 
485 #  ifdef XNU_TARGET_OS_OSX
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
487 #  else
488 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
489 #  endif
490 #endif /* __arm64__ */
491 
492 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
494 #else
495 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
496 #endif
497 
498 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
499 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
500 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
501 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
502 #if !HAS_16BIT_ASID
503 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
504 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
505 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
506 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
507 #else
508 static uint16_t last_allocated_asid = 0;
509 #endif /* !HAS_16BIT_ASID */
510 
511 
512 #if __ARM_MIXED_PAGE_SIZE__
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
514 #endif
515 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
516 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
517 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
518 
519 /* PTE Define Macros */
520 
/*
 * True if PTE value (x) marks a page swapped out by the VM compressor:
 * the PTE must be invalid (low two bits clear), carry the ARM_PTE_COMPRESSED
 * marker, and contain no bits outside ARM_PTE_COMPRESSED_MASK (any extra
 * bits indicate corruption and trigger a panic; (p) is the PTE's address,
 * reported in the panic string).
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True if the PTE's software wired bit is set. */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True if the PTE's software "was writeable" bit is set. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit in a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
542 
543 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)544 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
545 {
546 	if (wired) {
547 		*ptep |= ARM_PTE_WIRED;
548 	} else {
549 		*ptep &= ~ARM_PTE_WIRED;
550 	}
551 	/*
552 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
553 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
554 	 * never reclaimed.
555 	 */
556 	if (pmap == kernel_pmap) {
557 		return;
558 	}
559 	unsigned short *ptd_wiredcnt_ptr;
560 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
561 	if (wired) {
562 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 	} else {
564 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
565 		if (__improbable(prev_wired == 0)) {
566 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
567 		}
568 	}
569 }
570 
571 #if HAS_FEAT_XS
572 
573 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)574 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
575 {
576 	if (__improbable(pt_attr->stage2)) {
577 		return false;
578 	}
579 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
580 	case CACHE_ATTRINDX_POSTED_XS:
581 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
582 		return true;
583 	default:
584 		return false;
585 	}
586 }
587 
588 #endif /* HAS_FEAT_XS */
589 
/* Flush TLB entries for VA range [s, e) in the given pmap, then synchronize. */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
	arm64_sync_tlb(strong);                                                                               \
}
594 
595 /*
596  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
597  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
598  * will observe the updated PTE.
599  */
600 #define FLUSH_PTE()                                                                     \
601 	__builtin_arm_dmb(DMB_ISH);
602 
603 /*
604  * Synchronize updates to PTEs that were previously valid and thus may be cached in
605  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
606  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
607  * program order will not issue until the DSB completes.  Prior loads may be reordered
608  * after the barrier, but their behavior should not be materially affected by the
609  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
610  * matter for loads until the access is re-driven well after the TLB update is
611  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
612  * we should be in a position to handle access faults.  For "voluntary" PTE access
613  * restriction due to unmapping or protection, the decision to restrict access should
614  * have a data dependency on prior loads in order to avoid a data race.
615  */
616 #define FLUSH_PTE_STRONG()                                                             \
617 	__builtin_arm_dsb(DSB_ISHST);
618 
619 /**
620  * Write enough page table entries to map a single VM page. On systems where the
621  * VM page size does not match the hardware page size, multiple page table
622  * entries will need to be written.
623  *
624  * @note This function does not emit a barrier to ensure these page table writes
625  *       have completed before continuing. This is commonly needed. In the case
626  *       where a DMB or DSB barrier is needed, then use the write_pte() and
627  *       write_pte_strong() functions respectively instead of this one.
628  *
629  * @param ptep Pointer to the first page table entry to update.
630  * @param pte The value to write into each page table entry. In the case that
631  *            multiple PTEs are updated to a non-empty value, then the address
632  *            in this value will automatically be incremented for each PTE
633  *            write.
634  */
635 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)636 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
637 {
638 	/**
639 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
640 	 * systems, which is why it's checked at runtime instead of compile time.
641 	 * The "unreachable" warning needs to be suppressed because it still is a
642 	 * compile time constant on some systems.
643 	 */
644 	__unreachable_ok_push
645 	if (TEST_PAGE_RATIO_4) {
646 		if (((uintptr_t)ptep) & 0x1f) {
647 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
648 			    __func__, ptep, (void*)pte);
649 		}
650 
651 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
652 			/**
653 			 * If we're writing an empty/compressed PTE value, then don't
654 			 * auto-increment the address for each PTE write.
655 			 */
656 			*ptep = pte;
657 			*(ptep + 1) = pte;
658 			*(ptep + 2) = pte;
659 			*(ptep + 3) = pte;
660 		} else {
661 			*ptep = pte;
662 			*(ptep + 1) = pte | 0x1000;
663 			*(ptep + 2) = pte | 0x2000;
664 			*(ptep + 3) = pte | 0x3000;
665 		}
666 	} else {
667 		*ptep = pte;
668 	}
669 	__unreachable_ok_pop
670 }
671 
672 /**
673  * Writes enough page table entries to map a single VM page and then ensures
674  * those writes complete by executing a Data Memory Barrier.
675  *
676  * @note The DMB issued by this function is not strong enough to protect against
677  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
678  *       instruction is going to immediately be called after this write, it's
679  *       recommended to call write_pte_strong() instead of this function.
680  *
681  * See the function header for write_pte_fast() for more details on the
682  * parameters.
683  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: ensure the PTE store(s) are observed by subsequent loads */
}
690 
691 /**
692  * Writes enough page table entries to map a single VM page and then ensures
693  * those writes complete by executing a Data Synchronization Barrier. This
694  * barrier provides stronger guarantees than the DMB executed by write_pte().
695  *
696  * @note This function is useful if you're going to immediately flush the TLB
697  *       after making the PTE write. A DSB is required to protect against the
698  *       TLB invalidate being reordered before the PTE write.
699  *
700  * See the function header for write_pte_fast() for more details on the
701  * parameters.
702  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: order the PTE store(s) ahead of a following TLBI */
}
709 
710 /**
711  * Retrieve the pmap structure for the thread running on the current CPU.
712  */
713 pmap_t
current_pmap()714 current_pmap()
715 {
716 	const pmap_t current = vm_map_pmap(current_thread()->map);
717 
718 	assert(current != NULL);
719 
720 #if XNU_MONITOR
721 	/**
722 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
723 	 * decided by kernel-writable memory. This function is used in various parts
724 	 * of the PPL, and besides validating that the pointer returned by this
725 	 * function is indeed a pmap structure, it's also important to ensure that
726 	 * it's actually the current thread's pmap. This is because different pmaps
727 	 * will have access to different entitlements based on the code signature of
728 	 * their loaded process. So if a different user pmap is set in the current
729 	 * thread structure (in an effort to bypass code signing restrictions), even
730 	 * though the structure would validate correctly as it is a real pmap
731 	 * structure, it should fail here.
732 	 *
733 	 * This only needs to occur for user pmaps because the kernel pmap's root
734 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
735 	 * changed so it'd be redundant to check), and its code signing fields are
736 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
737 	 * it shouldn't be possible to set those fields. Due to that, an attacker
738 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
739 	 * this check won't accomplish anything as it doesn't provide any extra code
740 	 * signing entitlements.
741 	 */
742 	if ((current != kernel_pmap) &&
743 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
744 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
745 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
746 	}
747 #endif /* XNU_MONITOR */
748 
749 	return current;
750 }
751 
752 #if DEVELOPMENT || DEBUG
753 
754 /*
755  * Trace levels are controlled by a bitmask in which each
756  * level can be enabled/disabled by the (1<<level) position
757  * in the boot arg
758  * Level 0: PPL extension functionality
759  * Level 1: pmap lifecycle (create/destroy/switch)
760  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
761  * Level 3: internal state management (attributes/fast-fault)
762  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
763  */
764 
765 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
766 
767 #define PMAP_TRACE(level, ...) \
768 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
769 	        KDBG_RELEASE(__VA_ARGS__); \
770 	}
771 #else /* DEVELOPMENT || DEBUG */
772 
773 #define PMAP_TRACE(level, ...)
774 
775 #endif /* DEVELOPMENT || DEBUG */
776 
777 
778 /*
779  * Internal function prototypes (forward declarations).
780  */
781 
782 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
783 
784 static void pmap_set_reference(ppnum_t pn);
785 
786 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
787 
788 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
789 
790 static kern_return_t pmap_expand(
791 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
792 
793 static int pmap_remove_range(
794 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
795 
796 static tt_entry_t *pmap_tt1_allocate(
797 	pmap_t, vm_size_t, unsigned int);
798 
799 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
800 
801 static void pmap_tt1_deallocate(
802 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
803 
804 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
805 
806 static kern_return_t pmap_tt_allocate(
807 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
808 
809 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
810 
811 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
812 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
813 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
814 
815 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
816 
817 
818 static void pmap_unmap_commpage(
819 	pmap_t pmap);
820 
821 static boolean_t
822 pmap_is_64bit(pmap_t);
823 
824 
825 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
826 
827 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
828 
829 static bool pmap_update_cache_attributes_locked(
830 	ppnum_t, unsigned, bool);
831 
832 static boolean_t arm_clear_fast_fault(
833 	ppnum_t ppnum,
834 	vm_prot_t fault_type,
835 	pt_entry_t *pte_p);
836 
837 static void pmap_trim_self(pmap_t pmap);
838 static void pmap_trim_subord(pmap_t subord);
839 
840 
841 /*
842  * Temporary prototypes, while we wait for pmap_enter to move to taking an
843  * address instead of a page number.
844  */
845 static kern_return_t
846 pmap_enter_addr(
847 	pmap_t pmap,
848 	vm_map_address_t v,
849 	pmap_paddr_t pa,
850 	vm_prot_t prot,
851 	vm_prot_t fault_type,
852 	unsigned int flags,
853 	boolean_t wired);
854 
855 kern_return_t
856 pmap_enter_options_addr(
857 	pmap_t pmap,
858 	vm_map_address_t v,
859 	pmap_paddr_t pa,
860 	vm_prot_t prot,
861 	vm_prot_t fault_type,
862 	unsigned int flags,
863 	boolean_t wired,
864 	unsigned int options,
865 	__unused void   *arg,
866 	__unused pmap_mapping_type_t mapping_type);
867 
868 #ifdef CONFIG_XNUPOST
869 kern_return_t pmap_test(void);
870 #endif /* CONFIG_XNUPOST */
871 
872 PMAP_SUPPORT_PROTOTYPES(
873 	kern_return_t,
874 	arm_fast_fault, (pmap_t pmap,
875 	vm_map_address_t va,
876 	vm_prot_t fault_type,
877 	bool was_af_fault,
878 	bool from_user), ARM_FAST_FAULT_INDEX);
879 
880 PMAP_SUPPORT_PROTOTYPES(
881 	boolean_t,
882 	arm_force_fast_fault, (ppnum_t ppnum,
883 	vm_prot_t allow_mode,
884 	int options), ARM_FORCE_FAST_FAULT_INDEX);
885 
886 MARK_AS_PMAP_TEXT static boolean_t
887 arm_force_fast_fault_with_flush_range(
888 	ppnum_t ppnum,
889 	vm_prot_t allow_mode,
890 	int options,
891 	pmap_tlb_flush_range_t *flush_range);
892 
893 /**
894  * Definition of the states driving the batch cache attributes update
895  * state machine.
896  */
897 typedef struct {
898 	uint64_t page_index : 32,           /* The page index to be operated on */
899 	    state : 8,                      /* The current state of the update machine */
900 	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
901 	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
902 	:0;
903 } batch_set_cache_attr_state_t;
904 
905 /* Possible values of the "state" field. */
906 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
907 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
908 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
909 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
910 
911 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
912 
913 PMAP_SUPPORT_PROTOTYPES(
914 	batch_set_cache_attr_state_t,
915 	pmap_batch_set_cache_attributes, (
916 #if XNU_MONITOR
917 		volatile upl_page_info_t *user_page_list,
918 #else /* !XNU_MONITOR */
919 		upl_page_info_array_t user_page_list,
920 #endif /* XNU_MONITOR */
921 		batch_set_cache_attr_state_t state,
922 		unsigned int page_cnt,
923 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
924 
925 PMAP_SUPPORT_PROTOTYPES(
926 	kern_return_t,
927 	pmap_change_wiring, (pmap_t pmap,
928 	vm_map_address_t v,
929 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
930 
931 PMAP_SUPPORT_PROTOTYPES(
932 	pmap_t,
933 	pmap_create_options, (ledger_t ledger,
934 	vm_map_size_t size,
935 	unsigned int flags,
936 	kern_return_t * kr), PMAP_CREATE_INDEX);
937 
938 PMAP_SUPPORT_PROTOTYPES(
939 	void,
940 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
941 
942 PMAP_SUPPORT_PROTOTYPES(
943 	kern_return_t,
944 	pmap_enter_options, (pmap_t pmap,
945 	vm_map_address_t v,
946 	pmap_paddr_t pa,
947 	vm_prot_t prot,
948 	vm_prot_t fault_type,
949 	unsigned int flags,
950 	boolean_t wired,
951 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
952 
953 PMAP_SUPPORT_PROTOTYPES(
954 	pmap_paddr_t,
955 	pmap_find_pa, (pmap_t pmap,
956 	addr64_t va), PMAP_FIND_PA_INDEX);
957 
958 PMAP_SUPPORT_PROTOTYPES(
959 	kern_return_t,
960 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
961 
962 
963 PMAP_SUPPORT_PROTOTYPES(
964 	boolean_t,
965 	pmap_is_empty, (pmap_t pmap,
966 	vm_map_offset_t va_start,
967 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
968 
969 
970 PMAP_SUPPORT_PROTOTYPES(
971 	unsigned int,
972 	pmap_map_cpu_windows_copy, (ppnum_t pn,
973 	vm_prot_t prot,
974 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
975 
976 PMAP_SUPPORT_PROTOTYPES(
977 	void,
978 	pmap_ro_zone_memcpy, (zone_id_t zid,
979 	vm_offset_t va,
980 	vm_offset_t offset,
981 	const vm_offset_t new_data,
982 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
983 
984 PMAP_SUPPORT_PROTOTYPES(
985 	uint64_t,
986 	pmap_ro_zone_atomic_op, (zone_id_t zid,
987 	vm_offset_t va,
988 	vm_offset_t offset,
989 	zro_atomic_op_t op,
990 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
991 
992 PMAP_SUPPORT_PROTOTYPES(
993 	void,
994 	pmap_ro_zone_bzero, (zone_id_t zid,
995 	vm_offset_t va,
996 	vm_offset_t offset,
997 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
998 
999 PMAP_SUPPORT_PROTOTYPES(
1000 	vm_map_offset_t,
1001 	pmap_nest, (pmap_t grand,
1002 	pmap_t subord,
1003 	addr64_t vstart,
1004 	uint64_t size,
1005 	vm_map_offset_t vrestart,
1006 	kern_return_t * krp), PMAP_NEST_INDEX);
1007 
1008 PMAP_SUPPORT_PROTOTYPES(
1009 	void,
1010 	pmap_page_protect_options, (ppnum_t ppnum,
1011 	vm_prot_t prot,
1012 	unsigned int options,
1013 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1014 
1015 PMAP_SUPPORT_PROTOTYPES(
1016 	vm_map_address_t,
1017 	pmap_protect_options, (pmap_t pmap,
1018 	vm_map_address_t start,
1019 	vm_map_address_t end,
1020 	vm_prot_t prot,
1021 	unsigned int options,
1022 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1023 
1024 PMAP_SUPPORT_PROTOTYPES(
1025 	kern_return_t,
1026 	pmap_query_page_info, (pmap_t pmap,
1027 	vm_map_offset_t va,
1028 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1029 
1030 PMAP_SUPPORT_PROTOTYPES(
1031 	mach_vm_size_t,
1032 	pmap_query_resident, (pmap_t pmap,
1033 	vm_map_address_t start,
1034 	vm_map_address_t end,
1035 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1036 
1037 PMAP_SUPPORT_PROTOTYPES(
1038 	void,
1039 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1040 
1041 PMAP_SUPPORT_PROTOTYPES(
1042 	vm_map_address_t,
1043 	pmap_remove_options, (pmap_t pmap,
1044 	vm_map_address_t start,
1045 	vm_map_address_t end,
1046 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1047 
1048 
1049 PMAP_SUPPORT_PROTOTYPES(
1050 	void,
1051 	pmap_set_cache_attributes, (ppnum_t pn,
1052 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1053 
1054 PMAP_SUPPORT_PROTOTYPES(
1055 	void,
1056 	pmap_update_compressor_page, (ppnum_t pn,
1057 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1058 
1059 PMAP_SUPPORT_PROTOTYPES(
1060 	void,
1061 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1062 
1063 #if MACH_ASSERT || XNU_MONITOR
1064 PMAP_SUPPORT_PROTOTYPES(
1065 	void,
1066 	pmap_set_process, (pmap_t pmap,
1067 	int pid,
1068 	char *procname), PMAP_SET_PROCESS_INDEX);
1069 #endif
1070 
1071 PMAP_SUPPORT_PROTOTYPES(
1072 	void,
1073 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1074 
1075 PMAP_SUPPORT_PROTOTYPES(
1076 	vm_map_offset_t,
1077 	pmap_unnest_options, (pmap_t grand,
1078 	addr64_t vaddr,
1079 	uint64_t size,
1080 	vm_map_offset_t vrestart,
1081 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1082 
1083 PMAP_SUPPORT_PROTOTYPES(
1084 	void,
1085 	phys_attribute_set, (ppnum_t pn,
1086 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1087 
1088 PMAP_SUPPORT_PROTOTYPES(
1089 	void,
1090 	phys_attribute_clear, (ppnum_t pn,
1091 	unsigned int bits,
1092 	int options,
1093 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1094 
1095 #if __ARM_RANGE_TLBI__
1096 PMAP_SUPPORT_PROTOTYPES(
1097 	vm_map_address_t,
1098 	phys_attribute_clear_range, (pmap_t pmap,
1099 	vm_map_address_t start,
1100 	vm_map_address_t end,
1101 	unsigned int bits,
1102 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1103 #endif /* __ARM_RANGE_TLBI__ */
1104 
1105 
1106 PMAP_SUPPORT_PROTOTYPES(
1107 	void,
1108 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1109 
1110 PMAP_SUPPORT_PROTOTYPES(
1111 	void,
1112 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1113 
1114 PMAP_SUPPORT_PROTOTYPES(
1115 	void,
1116 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1117 
1118 PMAP_SUPPORT_PROTOTYPES(
1119 	void,
1120 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1121 
1122 PMAP_SUPPORT_PROTOTYPES(
1123 	void,
1124 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1125 
1126 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1127 PMAP_SUPPORT_PROTOTYPES(
1128 	void,
1129 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1130 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1131 
/*
 * Definition of the states used by pmap_trim().  The state value is both an
 * input parameter and the return value of pmap_trim (see the
 * PMAP_SUPPORT_PROTOTYPES declaration below), so a trim operation can be
 * resumed from the state where the previous call left off.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1152 
1153 PMAP_SUPPORT_PROTOTYPES(
1154 	pmap_trim_state_t,
1155 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1156 
1157 #if HAS_APPLE_PAC
1158 PMAP_SUPPORT_PROTOTYPES(
1159 	void *,
1160 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1161 PMAP_SUPPORT_PROTOTYPES(
1162 	void *,
1163 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1164 #endif /* HAS_APPLE_PAC */
1165 
1166 
1167 
1168 
1169 PMAP_SUPPORT_PROTOTYPES(
1170 	kern_return_t,
1171 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1172 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1173 
1174 PMAP_SUPPORT_PROTOTYPES(
1175 	kern_return_t,
1176 	pmap_load_trust_cache_with_type, (TCType_t type,
1177 	const vm_address_t pmap_img4_payload,
1178 	const vm_size_t pmap_img4_payload_len,
1179 	const vm_address_t img4_manifest,
1180 	const vm_size_t img4_manifest_len,
1181 	const vm_address_t img4_aux_manifest,
1182 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1183 
1184 PMAP_SUPPORT_PROTOTYPES(
1185 	void,
1186 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1187 
1188 PMAP_SUPPORT_PROTOTYPES(
1189 	kern_return_t,
1190 	pmap_query_trust_cache, (TCQueryType_t query_type,
1191 	const uint8_t cdhash[kTCEntryHashSize],
1192 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1193 
1194 #if PMAP_CS_INCLUDE_CODE_SIGNING
1195 
1196 PMAP_SUPPORT_PROTOTYPES(
1197 	kern_return_t,
1198 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1199 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1200 
1201 PMAP_SUPPORT_PROTOTYPES(
1202 	kern_return_t,
1203 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1204 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1205 
1206 PMAP_SUPPORT_PROTOTYPES(
1207 	kern_return_t,
1208 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1209 	pmap_cs_profile_t * profile_obj),
1210 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1211 
1212 PMAP_SUPPORT_PROTOTYPES(
1213 	kern_return_t,
1214 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1215 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1216 
1217 PMAP_SUPPORT_PROTOTYPES(
1218 	kern_return_t,
1219 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1220 	const void *kernel_entitlements),
1221 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1222 
1223 PMAP_SUPPORT_PROTOTYPES(
1224 	kern_return_t,
1225 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1226 	const void **kernel_entitlements),
1227 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1228 
1229 PMAP_SUPPORT_PROTOTYPES(
1230 	kern_return_t,
1231 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1232 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1233 
1234 PMAP_SUPPORT_PROTOTYPES(
1235 	kern_return_t,
1236 	pmap_cs_allow_invalid, (pmap_t pmap),
1237 	PMAP_CS_ALLOW_INVALID_INDEX);
1238 
1239 PMAP_SUPPORT_PROTOTYPES(
1240 	void,
1241 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1242 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1243 
1244 PMAP_SUPPORT_PROTOTYPES(
1245 	bool,
1246 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1247 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1248 
1249 PMAP_SUPPORT_PROTOTYPES(
1250 	void,
1251 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1252 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1253 
1254 PMAP_SUPPORT_PROTOTYPES(
1255 	void,
1256 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1257 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1258 
1259 #endif
1260 
1261 PMAP_SUPPORT_PROTOTYPES(
1262 	uint32_t,
1263 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1264 
1265 PMAP_SUPPORT_PROTOTYPES(
1266 	bool,
1267 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1268 
1269 PMAP_SUPPORT_PROTOTYPES(
1270 	void,
1271 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1272 
1273 void pmap_footprint_suspend(vm_map_t    map,
1274     boolean_t   suspend);
1275 PMAP_SUPPORT_PROTOTYPES(
1276 	void,
1277 	pmap_footprint_suspend, (vm_map_t map,
1278 	boolean_t suspend),
1279 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1280 
1281 
1282 
1283 
1284 #if DEVELOPMENT || DEBUG
1285 PMAP_SUPPORT_PROTOTYPES(
1286 	kern_return_t,
1287 	pmap_test_text_corruption, (pmap_paddr_t),
1288 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1289 #endif /* DEVELOPMENT || DEBUG */
1290 
1291 /*
1292  * The low global vector page is mapped at a fixed alias.
1293  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1294  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1295  * to check both addresses anyway for backward compatibility. So for now
1296  * we leave H6 and H7 where they were.
1297  */
1298 #if (ARM_PGSHIFT == 14)
1299 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1300 #else
1301 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1302 #endif
1303 
1304 
/*
 * Page-table allocation counters (root tables, TTE pages, PTE pages).
 * 8-byte aligned so 64-bit updates stay atomic — presumably incremented by
 * the table allocation paths elsewhere in this file; confirm.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1308 
#if XNU_MONITOR

/*
 * When pointer authentication is available, each table entry is individually
 * signed as a function pointer; otherwise the qualifier is a no-op.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the *_INDEX constants used in the PMAP_SUPPORT_PROTOTYPES
 * declarations above; each conditionally-compiled entry is guarded by the
 * same configuration conditional as its prototype.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1401 
1402 #if XNU_MONITOR
1403 /**
1404  * A convenience function for setting protections on a single physical
1405  * aperture or static region mapping without invalidating the TLB.
1406  *
1407  * @note This function does not perform any TLB invalidations. That must be done
1408  *       separately to be able to safely use the updated mapping.
1409  *
1410  * @note This function understands the difference between the VM page size and
1411  *       the kernel page size and will update multiple PTEs if the sizes differ.
1412  *       In other words, enough PTEs will always get updated to change the
1413  *       permissions on a PAGE_SIZE amount of memory.
1414  *
1415  * @note The PVH lock for the physical page represented by this mapping must
1416  *       already be locked.
1417  *
1418  * @note This function assumes the caller has already verified that the PTE
1419  *       pointer does indeed point to a physical aperture or static region page
1420  *       table. Please validate your inputs before passing it along to this
1421  *       function.
1422  *
1423  * @param ptep Pointer to the physical aperture or static region page table to
1424  *             update with a new XPRR index.
1425  * @param expected_perm The XPRR index that is expected to already exist at the
1426  *                      current mapping. If the current index doesn't match this
1427  *                      then the system will panic.
1428  * @param new_perm The new XPRR index to update the mapping with.
1429  */
1430 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1431 pmap_set_pte_xprr_perm(
1432 	pt_entry_t * const ptep,
1433 	unsigned int expected_perm,
1434 	unsigned int new_perm)
1435 {
1436 	assert(ptep != NULL);
1437 
1438 	pt_entry_t spte = *ptep;
1439 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1440 
1441 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1442 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1443 		    __func__, ptep, new_perm, expected_perm);
1444 	}
1445 
1446 	/**
1447 	 * The PTE involved should be valid, should not have the hint bit set, and
1448 	 * should have the expected XPRR index.
1449 	 */
1450 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1451 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1452 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1453 		    __func__, ptep, spte, new_perm, expected_perm);
1454 	}
1455 
1456 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1457 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1458 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1459 		    __func__, ptep, spte, new_perm, expected_perm);
1460 	}
1461 
1462 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1463 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1464 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1465 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1466 	}
1467 
1468 	pt_entry_t template = spte;
1469 	template &= ~ARM_PTE_XPRR_MASK;
1470 	template |= xprr_perm_to_pte(new_perm);
1471 
1472 	write_pte_strong(ptep, template);
1473 }
1474 
1475 /**
1476  * Update the protections on a single physical aperture mapping and invalidate
1477  * the TLB so the mapping can be used.
1478  *
1479  * @note The PVH lock for the physical page must already be locked.
1480  *
1481  * @param pai The physical address index of the page whose physical aperture
1482  *            mapping will be updated with new permissions.
1483  * @param expected_perm The XPRR index that is expected to already exist at the
1484  *                      current mapping. If the current index doesn't match this
1485  *                      then the system will panic.
1486  * @param new_perm The new XPRR index to update the mapping with.
1487  */
1488 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1489 pmap_set_xprr_perm(
1490 	unsigned int pai,
1491 	unsigned int expected_perm,
1492 	unsigned int new_perm)
1493 {
1494 	pvh_assert_locked(pai);
1495 
1496 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1497 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1498 
1499 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1500 
1501 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1502 	sync_tlb_flush();
1503 }
1504 
1505 /**
1506  * Update the protections on a range of physical aperture or static region
1507  * mappings and invalidate the TLB so the mappings can be used.
1508  *
1509  * @note Static region mappings can only be updated before machine_lockdown().
1510  *       Physical aperture mappings can be updated at any time.
1511  *
1512  * @param start The starting virtual address of the static region or physical
1513  *              aperture range whose permissions will be updated.
1514  * @param end The final (inclusive) virtual address of the static region or
1515  *            physical aperture range whose permissions will be updated.
1516  * @param expected_perm The XPRR index that is expected to already exist at the
1517  *                      current mappings. If the current indices don't match
1518  *                      this then the system will panic.
1519  * @param new_perm The new XPRR index to update the mappings with.
1520  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within one of the two supported regions. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type twig entries are expected in these regions. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* Take the PVH lock for each page, as pmap_set_pte_xprr_perm() requires it. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One TLB invalidation for the whole range, after all PTEs are rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1616 
1617 #endif /* XNU_MONITOR */
1618 
1619 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1620 PMAP_ZINFO_PALLOC(
1621 	pmap_t pmap, int bytes)
1622 {
1623 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1624 }
1625 
1626 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1627 PMAP_ZINFO_PFREE(
1628 	pmap_t pmap,
1629 	int bytes)
1630 {
1631 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1632 }
1633 
1634 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1635 pmap_tt_ledger_credit(
1636 	pmap_t          pmap,
1637 	vm_size_t       size)
1638 {
1639 	if (pmap != kernel_pmap) {
1640 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1641 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1642 	}
1643 }
1644 
1645 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1646 pmap_tt_ledger_debit(
1647 	pmap_t          pmap,
1648 	vm_size_t       size)
1649 {
1650 	if (pmap != kernel_pmap) {
1651 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1652 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1653 	}
1654 }
1655 
/**
 * Record that a hardware ASID has just been handed out, updating the
 * Pseudo-LRU replacement state used by alloc_asid().
 *
 * @param asid_index The hardware ASID that was just allocated.  Unused on
 *                   targets with 16-bit ASIDs, where no pLRU tracking exists.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit word of asid_plru_bitmap tracks 64 hardware ASIDs. */
		unsigned plru_index = asid_index >> 6;
		/* Clear this ASID's "available" bit.  If the whole word drains to
		 * zero, start a new generation for it and refill the word so its
		 * ASIDs become eligible again (they are now the least recently
		 * used chunk). */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/* The top bit of the final word corresponds to the
			 * out-of-range hardware ASID value MAX_HW_ASIDS; keep it
			 * permanently cleared when refilling. */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1669 
/**
 * Allocate a virtual ASID for the given pmap and derive the hardware ASID
 * from it.  On success, populates pmap->hw_asid and pmap->sw_asid.
 *
 * @param pmap The pmap to receive the newly allocated ASID.
 *
 * @return true on success, false if the virtual ASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Pseudo-LRU policy: pick the 64-ASID chunk with the lowest
		 * (oldest) generation count... */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/* ...then stride through the virtual ASID bitmap one chunk at a
		 * time, looking for a free virtual ASID whose hardware ASID falls
		 * within that least-recently-used chunk. */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No virtual ASIDs remain at all; the caller must fail the operation. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID plus a software "epoch"
	 * index; free_asid() reverses this encoding. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1754 
/**
 * Return the ASID held by the given pmap to the allocator.
 *
 * Safe to call on a pmap that holds no ASID (hw_asid == 0), and the atomic
 * exchange makes the release idempotent against concurrent callers.
 *
 * @param pmap The pmap whose ASID should be freed.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically claim the ASID so that exactly one caller performs the free. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

	/* Undo the encoding applied at the end of alloc_asid(). */
#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;
#endif
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* This pmap was placed in the reserved virtual ASID range because the
		 * pLRU allocator handed out the unsupported value MAX_HW_ASIDS; see
		 * the corresponding branch in alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again in the pLRU state. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1787 
1788 
1789 boolean_t
pmap_valid_address(pmap_paddr_t addr)1790 pmap_valid_address(
1791 	pmap_paddr_t addr)
1792 {
1793 	return pa_valid(addr);
1794 }
1795 
1796 
1797 
1798 
1799 
1800 
1801 /*
1802  *      Map memory at initialization.  The physical addresses being
1803  *      mapped are not managed and are never unmapped.
1804  *
1805  *      For now, VM is already on, we only need to map the
1806  *      specified memory.
1807  */
1808 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1809 pmap_map(
1810 	vm_map_address_t virt,
1811 	vm_offset_t start,
1812 	vm_offset_t end,
1813 	vm_prot_t prot,
1814 	unsigned int flags)
1815 {
1816 	kern_return_t   kr;
1817 	vm_size_t       ps;
1818 
1819 	ps = PAGE_SIZE;
1820 	while (start < end) {
1821 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1822 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1823 
1824 		if (kr != KERN_SUCCESS) {
1825 			panic("%s: failed pmap_enter, "
1826 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1827 			    __FUNCTION__,
1828 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1829 		}
1830 
1831 		virt += ps;
1832 		start += ps;
1833 	}
1834 	return virt;
1835 }
1836 
1837 #if XNU_MONITOR
1838 /**
1839  * Remove kernel writeablity from an IO PTE value if the page is owned by
1840  * guarded mode software.
1841  *
1842  * @param paddr The physical address of the page which has to be non-DRAM.
1843  * @param tmplate The PTE value to be evaluated.
1844  *
1845  * @return A new PTE value with permission bits modified.
1846  */
1847 static inline
1848 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1849 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1850 {
1851 	assert(!pa_valid(paddr));
1852 
1853 	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1854 
1855 	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1856 		/* PPL to own the page by converting KERN_RW to PPL_RW. */
1857 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1858 		switch (xprr_perm) {
1859 		case XPRR_KERN_RO_PERM:
1860 			break;
1861 		case XPRR_KERN_RW_PERM:
1862 			tmplate &= ~ARM_PTE_XPRR_MASK;
1863 			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1864 			break;
1865 		default:
1866 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1867 		}
1868 	}
1869 
1870 	return tmplate;
1871 }
1872 #endif /* XNU_MONITOR */
1873 
/*
 * Back-door mapping of the physical range [start, end) at virtual address
 * `virt` in the kernel pmap, for use at initialization.  The memory type
 * (write-combined, posted-device variants, or uncached by default) is
 * selected by `options`; mappings are kernel-only and non-executable.
 * Returns the virtual address just past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	/* Translate the mapping option into memory-attribute/shareability PTE bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* IO page: may need PPL-ownership permission adjustment. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template by one page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the virtual range just written. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1952 
1953 /*
1954  *      Back-door routine for mapping kernel VM at initialization.
1955  *      Useful for mapping memory outside the range
1956  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1957  *      Otherwise like pmap_map.
1958  */
1959 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1960 pmap_map_bd(
1961 	vm_map_address_t virt,
1962 	vm_offset_t start,
1963 	vm_offset_t end,
1964 	vm_prot_t prot)
1965 {
1966 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
1967 }
1968 
1969 /*
1970  *      Back-door routine for mapping kernel VM at initialization.
1971  *      Useful for mapping memory specific physical addresses in early
1972  *      boot (i.e., before kernel_map is initialized).
1973  *
1974  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1975  */
1976 
1977 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1978 pmap_map_high_window_bd(
1979 	vm_offset_t pa_start,
1980 	vm_size_t len,
1981 	vm_prot_t prot)
1982 {
1983 	pt_entry_t              *ptep, pte;
1984 	vm_map_address_t        va_start = VREGION1_START;
1985 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1986 	vm_map_address_t        va_end;
1987 	vm_map_address_t        va;
1988 	vm_size_t               offset;
1989 
1990 	offset = pa_start & PAGE_MASK;
1991 	pa_start -= offset;
1992 	len += offset;
1993 
1994 	if (len > (va_max - va_start)) {
1995 		panic("%s: area too large, "
1996 		    "pa_start=%p, len=%p, prot=0x%x",
1997 		    __FUNCTION__,
1998 		    (void*)pa_start, (void*)len, prot);
1999 	}
2000 
2001 scan:
2002 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2003 		ptep = pmap_pte(kernel_pmap, va_start);
2004 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2005 		if (*ptep == ARM_PTE_TYPE_FAULT) {
2006 			break;
2007 		}
2008 	}
2009 	if (va_start > va_max) {
2010 		panic("%s: insufficient pages, "
2011 		    "pa_start=%p, len=%p, prot=0x%x",
2012 		    __FUNCTION__,
2013 		    (void*)pa_start, (void*)len, prot);
2014 	}
2015 
2016 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2017 		ptep = pmap_pte(kernel_pmap, va_end);
2018 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2019 		if (*ptep != ARM_PTE_TYPE_FAULT) {
2020 			va_start = va_end + PAGE_SIZE;
2021 			goto scan;
2022 		}
2023 	}
2024 
2025 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2026 		ptep = pmap_pte(kernel_pmap, va);
2027 		pte = pa_to_pte(pa_start)
2028 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2029 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2030 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2031 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2032 #if __ARM_KERNEL_PROTECT__
2033 		pte |= ARM_PTE_NG;
2034 #endif /* __ARM_KERNEL_PROTECT__ */
2035 		write_pte_strong(ptep, pte);
2036 	}
2037 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2038 #if KASAN
2039 	kasan_notify_address(va_start, len);
2040 #endif
2041 	return va_start;
2042 }
2043 
2044 static uint32_t
pmap_compute_max_asids(void)2045 pmap_compute_max_asids(void)
2046 {
2047 	DTEntry entry;
2048 	void const *prop = NULL;
2049 	uint32_t max_asids;
2050 	int err;
2051 	unsigned int prop_size;
2052 
2053 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2054 	assert(err == kSuccess);
2055 
2056 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2057 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
2058 		 * we can choose a more flexible default value here. */
2059 		return MAX_ASIDS;
2060 	}
2061 
2062 	if (prop_size != sizeof(max_asids)) {
2063 		panic("pmap-max-asids property is not a 32-bit integer");
2064 	}
2065 
2066 	max_asids = *((uint32_t const *)prop);
2067 #if HAS_16BIT_ASID
2068 	if (max_asids > MAX_HW_ASIDS) {
2069 		panic("pmap-max-asids 0x%x too large", max_asids);
2070 	}
2071 #else
2072 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2073 	max_asids = (max_asids + 63) & ~63UL;
2074 
2075 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2076 		/* currently capped by size of pmap->sw_asid */
2077 		panic("pmap-max-asids 0x%x too large", max_asids);
2078 	}
2079 #endif /* HAS_16BIT_ASID */
2080 	if (max_asids == 0) {
2081 		panic("pmap-max-asids cannot be zero");
2082 	}
2083 	return max_asids;
2084 }
2085 
2086 #if __arm64__
2087 /*
2088  * pmap_get_arm64_prot
2089  *
2090  * return effective armv8 VMSA block protections including
2091  * table AP/PXN/XN overrides of a pmap entry
2092  *
2093  */
2094 
/**
 * Compute the effective VMSA protection bits (AP/XN/PXN) for the mapping of
 * `addr` in `pmap`, accounting for table-level overrides accumulated along
 * the translation-table walk.
 *
 * @param pmap The pmap whose translation tables to walk.
 * @param addr The virtual address to look up.
 *
 * @return The effective AP/NX/PNX bits in PTE layout, or 0 if the address
 *         has no valid mapping at some level of the walk.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level toward the leaf, accumulating the override
	 * bits carried by each table descriptor along the way. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* No mapping at this level: the address is unmapped. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated table-level override fields. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2143 #endif /* __arm64__ */
2144 
2145 /*
2146  *	Bootstrap the system enough to run with virtual memory.
2147  *
2148  *	The early VM initialization code has already allocated
2149  *	the first CPU's translation table and made entries for
2150  *	all the one-to-one mappings to be found there.
2151  *
2152  *	We must set up the kernel pmap structures, the
2153  *	physical-to-virtual translation lookup tables for the
2154  *	physical memory to be managed (between avail_start and
2155  *	avail_end).
2156  *
2157  *	Map the kernel's code and data, and allocate the system page table.
2158  *	Page_size must already be set.
2159  *
2160  *	Parameters:
2161  *	first_avail	first available physical page -
2162  *			   after kernel page tables
2163  *	avail_start	PA of first managed physical page
2164  *	avail_end	PA of last managed physical page
2165  */
2166 
2167 void
pmap_bootstrap(vm_offset_t vstart)2168 pmap_bootstrap(
2169 	vm_offset_t vstart)
2170 {
2171 	vm_map_offset_t maxoffset;
2172 
2173 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2174 
2175 #if XNU_MONITOR
2176 
2177 #if DEVELOPMENT || DEBUG
2178 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2179 #endif
2180 
2181 #if CONFIG_CSR_FROM_DT
2182 	if (csr_unsafe_kernel_text) {
2183 		pmap_ppl_disable = true;
2184 	}
2185 #endif /* CONFIG_CSR_FROM_DT */
2186 
2187 #endif /* XNU_MONITOR */
2188 
2189 #if DEVELOPMENT || DEBUG
2190 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2191 		kprintf("Kernel traces for pmap operations enabled\n");
2192 	}
2193 #endif
2194 
2195 	/*
2196 	 *	Initialize the kernel pmap.
2197 	 */
2198 #if ARM_PARAMETERIZED_PMAP
2199 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2200 #endif /* ARM_PARAMETERIZED_PMAP */
2201 #if HAS_APPLE_PAC
2202 	kernel_pmap->disable_jop = 0;
2203 #endif /* HAS_APPLE_PAC */
2204 	kernel_pmap->tte = cpu_tte;
2205 	kernel_pmap->ttep = cpu_ttep;
2206 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2207 	kernel_pmap->max = UINTPTR_MAX;
2208 	os_atomic_init(&kernel_pmap->ref_count, 1);
2209 #if XNU_MONITOR
2210 	os_atomic_init(&kernel_pmap->nested_count, 0);
2211 #endif
2212 	kernel_pmap->nx_enabled = TRUE;
2213 #ifdef  __arm64__
2214 	kernel_pmap->is_64bit = TRUE;
2215 #else
2216 	kernel_pmap->is_64bit = FALSE;
2217 #endif
2218 #if CONFIG_ROSETTA
2219 	kernel_pmap->is_rosetta = FALSE;
2220 #endif
2221 
2222 #if ARM_PARAMETERIZED_PMAP
2223 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2224 #endif /* ARM_PARAMETERIZED_PMAP */
2225 
2226 	kernel_pmap->nested_region_addr = 0x0ULL;
2227 	kernel_pmap->nested_region_size = 0x0ULL;
2228 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2229 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2230 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2231 
2232 	kernel_pmap->hw_asid = 0;
2233 	kernel_pmap->sw_asid = 0;
2234 
2235 	pmap_lock_init(kernel_pmap);
2236 
2237 	pmap_max_asids = pmap_compute_max_asids();
2238 #if HAS_16BIT_ASID
2239 	asid_chunk_size = MAX_HW_ASIDS;
2240 #else
2241 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2242 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2243 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2244 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2245 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2246 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2247 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2248 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2249 #endif /* HAS_16BIT_ASIDS */
2250 
2251 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2252 
2253 	/**
2254 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2255 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2256 	 * space for these data structures.
2257 	 */
2258 	pmap_data_bootstrap();
2259 
2260 	/**
2261 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2262 	 */
2263 	uat_bootstrap();
2264 
2265 
2266 	/**
2267 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2268 	 */
2269 	sart_bootstrap();
2270 
2271 	/**
2272 	 * Don't make any assumptions about the alignment of avail_start before this
2273 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2274 	 */
2275 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2276 
2277 	const pmap_paddr_t pmap_struct_start = avail_start;
2278 
2279 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2280 	avail_start = round_page(avail_start + asid_table_size);
2281 
2282 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2283 
2284 	vm_first_phys = gPhysBase;
2285 	vm_last_phys = trunc_page(avail_end);
2286 
2287 	queue_init(&map_pmap_list);
2288 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2289 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2290 	free_page_size_tt_count = 0;
2291 	free_page_size_tt_max = 0;
2292 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2293 	free_two_page_size_tt_count = 0;
2294 	free_two_page_size_tt_max = 0;
2295 	free_tt_list = TT_FREE_ENTRY_NULL;
2296 	free_tt_count = 0;
2297 	free_tt_max = 0;
2298 
2299 	virtual_space_start = vstart;
2300 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2301 
2302 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2303 #if !HAS_16BIT_ASID
2304 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2305 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2306 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2307 #endif /* !HAS_16BIT_ASID */
2308 
2309 
2310 
2311 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2312 		maxoffset = trunc_page(maxoffset);
2313 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2314 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2315 			arm_pmap_max_offset_default = maxoffset;
2316 		}
2317 	}
2318 #if defined(__arm64__)
2319 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2320 		maxoffset = trunc_page(maxoffset);
2321 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2322 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2323 			arm64_pmap_max_offset_default = maxoffset;
2324 		}
2325 	}
2326 #endif
2327 
2328 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2329 
2330 
2331 #if PMAP_CS_PPL_MONITOR
2332 	/* Initialize the PPL trust cache read-write lock */
2333 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2334 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2335 #endif
2336 
2337 #if MACH_ASSERT
2338 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2339 	    &vm_footprint_suspend_allowed,
2340 	    sizeof(vm_footprint_suspend_allowed));
2341 #endif /* MACH_ASSERT */
2342 
2343 #if KASAN
2344 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2345 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2346 #endif /* KASAN */
2347 
2348 	/**
2349 	 * Ensure that avail_start is always left on a page boundary. The calling
2350 	 * code might not perform any alignment before allocating page tables so
2351 	 * this is important.
2352 	 */
2353 	avail_start = round_page(avail_start);
2354 }
2355 
2356 #if XNU_MONITOR
2357 
2358 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2359 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2360 {
2361 	pmap_paddr_t cur_pa;
2362 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2363 		assert(pa_valid(cur_pa));
2364 		ppattr_pa_set_monitor(cur_pa);
2365 	}
2366 }
2367 
2368 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2369 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2370     pmap_paddr_t end_pa,
2371     unsigned int expected_perm,
2372     unsigned int new_perm)
2373 {
2374 	vm_offset_t start_va = phystokv(start_pa);
2375 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2376 
2377 	pa_set_range_monitor(start_pa, end_pa);
2378 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2379 }
2380 
/*
 * Lock down every physical page backing the kernelcache so its mappings can
 * no longer be altered, by tagging each page's pv_head entry with
 * PVH_FLAG_LOCKDOWN_KC.  Pages whose physical address does not map linearly
 * back to the kernelcache virtual range are skipped (they will be reclaimed
 * by the OS).  Panics if a page is already locked down.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() may shrink range_size to the contiguous run it
		 * can translate; advance by that amount when skipping. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		/* Tag the page so subsequent remapping attempts are rejected. */
		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR XNUPOST test pages must remain remappable; exempt them. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2426 
/*
 * Called once all static/bootstrap physical allocations are complete.
 * Transfers ownership of the bootstrap page tables, remaining bootstrap
 * allocations, RO page tables, PPL text/data segments, and PPL stacks to
 * the PPL by adjusting their xPRR permissions and monitor ownership, then
 * locks down the kernelcache against remapping.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Same-permission transition: used here to mark the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2514 
/*
 * Mark the PPL as locked down: lock down the commpage RO-data and (if
 * present) commpage text pages so their mappings can no longer be changed.
 *
 * NOTE(review): the bare `#error "XPRR configuration error"` below appears
 * to be the fallback arm of a preprocessor conditional whose selected branch
 * was elided by the cross-reference tool that produced this listing —
 * confirm against the original source before building.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* The commpage text page must remain executable from EL0. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
#error "XPRR configuration error"
}
2531 #endif /* XNU_MONITOR */
2532 
2533 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2534 pmap_virtual_space(
2535 	vm_offset_t *startp,
2536 	vm_offset_t *endp
2537 	)
2538 {
2539 	*startp = virtual_space_start;
2540 	*endp = virtual_space_end;
2541 }
2542 
2543 
/*
 * Enumerate the kernel virtual regions the VM may use for its own purposes.
 * The caller iterates region_select from 0 upward; returns TRUE and fills
 * *startp/*size while a region exists for that index, FALSE otherwise.
 * Which regions exist depends on KTRR/CTRR and large-memory configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K page geometry: 32MB (0x2000000) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K page geometry: 8MB (0x800000) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region if it doesn't already overlap the kernel base. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2637 
2638 /*
2639  * Routines to track and allocate physical pages during early boot.
2640  * On most systems that memory runs from first_avail through to avail_end
2641  * with no gaps.
2642  *
2643  * If the system supports ECC and ecc_bad_pages_count > 0, we
2644  * need to skip those pages.
2645  */
2646 
static unsigned int avail_page_count = 0;  /* Pages left in [first_avail, avail_end); maintained by pmap_next_page(). */
static bool need_ram_ranges_init = true;   /* One-shot latch: initialize_ram_ranges() has not yet run. */
2649 
2650 
2651 /**
2652  * Checks to see if a given page is in
2653  * the array of known bad pages
2654  *
2655  * @param ppn page number to check
2656  */
2657 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2658 pmap_is_bad_ram(__unused ppnum_t ppn)
2659 {
2660 	return false;
2661 }
2662 
2663 /**
2664  * Prepare bad ram pages to be skipped.
2665  */
2666 
2667 /*
2668  * Initialize the count of available pages. No lock needed here,
2669  * as this code is called while kernel boot up is single threaded.
2670  */
2671 static void
initialize_ram_ranges(void)2672 initialize_ram_ranges(void)
2673 {
2674 	pmap_paddr_t first = first_avail;
2675 	pmap_paddr_t end = avail_end;
2676 
2677 	assert(first <= end);
2678 	assert(first == (first & ~PAGE_MASK));
2679 	assert(end == (end & ~PAGE_MASK));
2680 	avail_page_count = atop(end - first);
2681 
2682 	need_ram_ranges_init = false;
2683 }
2684 
2685 unsigned int
pmap_free_pages(void)2686 pmap_free_pages(
2687 	void)
2688 {
2689 	if (need_ram_ranges_init) {
2690 		initialize_ram_ranges();
2691 	}
2692 	return avail_page_count;
2693 }
2694 
2695 unsigned int
pmap_free_pages_span(void)2696 pmap_free_pages_span(
2697 	void)
2698 {
2699 	if (need_ram_ranges_init) {
2700 		initialize_ram_ranges();
2701 	}
2702 	return (unsigned int)atop(avail_end - first_avail);
2703 }
2704 
2705 
/*
 * Allocate the next early-boot page "from high memory".  On this pmap there
 * is no separate high-memory pool, so this simply defers to pmap_next_page();
 * the might_free hint is ignored.
 */
boolean_t
pmap_next_page_hi(
	ppnum_t            * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2713 
2714 
2715 boolean_t
pmap_next_page(ppnum_t * pnum)2716 pmap_next_page(
2717 	ppnum_t *pnum)
2718 {
2719 	if (need_ram_ranges_init) {
2720 		initialize_ram_ranges();
2721 	}
2722 
2723 
2724 	if (first_avail != avail_end) {
2725 		*pnum = (ppnum_t)atop(first_avail);
2726 		first_avail += PAGE_SIZE;
2727 		assert(avail_page_count > 0);
2728 		--avail_page_count;
2729 		return TRUE;
2730 	}
2731 	assert(avail_page_count == 0);
2732 	return FALSE;
2733 }
2734 
2735 
2736 /*
2737  *	Initialize the pmap module.
2738  *	Called by vm_init, to initialize any structures that the pmap
2739  *	system needs to map virtual memory.
2740  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	/* Each process needs its own ASID, so clamp the process limits. */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2781 
2782 /**
2783  * Verify that a given physical page contains no mappings (outside of the
2784  * default physical aperture mapping).
2785  *
2786  * @param ppnum Physical page number to check there are no mappings to.
2787  *
2788  * @return True if there are no mappings, false otherwise or if the page is not
2789  *         kernel-managed.
2790  */
2791 bool
pmap_verify_free(ppnum_t ppnum)2792 pmap_verify_free(ppnum_t ppnum)
2793 {
2794 	const pmap_paddr_t pa = ptoa(ppnum);
2795 
2796 	assert(pa != vm_page_fictitious_addr);
2797 
2798 	/* Only mappings to kernel-managed physical memory are tracked. */
2799 	if (!pa_valid(pa)) {
2800 		return false;
2801 	}
2802 
2803 	const unsigned int pai = pa_index(pa);
2804 	pv_entry_t **pvh = pai_to_pvh(pai);
2805 
2806 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2807 }
2808 
2809 #if MACH_ASSERT
2810 /**
2811  * Verify that a given physical page contains no mappings (outside of the
2812  * default physical aperture mapping) and if it does, then panic.
2813  *
2814  * @note It's recommended to use pmap_verify_free() directly when operating in
2815  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2816  *       normally being called from outside of the PPL, and the pv_head_table
2817  *       can't be modified outside of the PPL).
2818  *
2819  * @param ppnum Physical page number to check there are no mappings to.
2820  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* At least one extra mapping exists; gather details for the panic below. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH holds the PTE pointer directly. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2893 #endif
2894 
2895 
2896 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2897 pmap_root_alloc_size(pmap_t pmap)
2898 {
2899 #pragma unused(pmap)
2900 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2901 	unsigned int root_level = pt_attr_root_level(pt_attr);
2902 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2903 }
2904 
2905 
2906 /*
2907  *	Create and return a physical map.
2908  *
2909  *	If the size specified for the map
2910  *	is zero, the map is an actual physical
2911  *	map, and may be referenced by the
2912  *	hardware.
2913  *
2914  *	If the size specified is non-zero,
2915  *	the map will be used in software only, and
2916  *	is bounded by that size.
2917  */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation doesn't understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* The ledger is caller-supplied; validate it before taking a reference. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nested (shared-region) bounds until pmap_nest() establishes them. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Claim an ASID; without one the pmap can never be activated. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Error unwinding: release resources in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* *kr lives in kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3108 
/*
 * Kernel-side entry point for creating a pmap.  Takes a reference on the
 * caller's ledger for the pmap's lifetime and dispatches to the PPL or the
 * in-kernel implementation depending on configuration.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	/*
	 * The PPL cannot allocate its own pages; on resource shortage, donate a
	 * page to the PPL from here and retry until creation succeeds or fails
	 * for a different reason.
	 */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		/* Creation failed: drop the ledger reference taken above. */
		ledger_dereference(ledger);
	}

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3144 
3145 #if XNU_MONITOR
3146 /*
3147  * This symbol remains in place when the PPL is enabled so that the dispatch
3148  * table does not change from development to release configurations.
3149  */
3150 #endif
3151 #if MACH_ASSERT || XNU_MONITOR
/*
 * Record the owning process's pid and name on the pmap for debugging
 * (MACH_ASSERT builds only; otherwise a no-op stub kept for the PPL
 * dispatch table).
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	/* A pmap_pid of -1 marks the pmap as opted out of pid tracking. */
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3169 #endif /* MACH_ASSERT || XNU_MONITOR */
3170 
3171 #if MACH_ASSERT
/*
 * Kernel-side wrapper: attach the owning process's pid/name to a pmap,
 * crossing into the PPL when one is present.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3184 #endif /* MACH_ASSERT */
3185 
3186 /*
3187  * pmap_deallocate_all_leaf_tts:
3188  *
3189  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3190  * removing and deallocating all TTEs.
3191  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf level holds PTEs, not TTEs; recursion must stop above it. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of ~0 yields the last TTE slot in a table at this level. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* User pmaps being destroyed should contain only table entries. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching unlock here — pmap_tte_deallocate() presumably drops the lock; confirm. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3230 
3231 /*
3232  * We maintain stats and ledgers so that a task's physical footprint is:
3233  * phys_footprint = ((internal - alternate_accounting)
3234  *                   + (internal_compressed - alternate_accounting_compressed)
3235  *                   + iokit_mapped
3236  *                   + purgeable_nonvolatile
3237  *                   + purgeable_nonvolatile_compressed
3238  *                   + page_table)
3239  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3240  */
3241 
3242 /*
3243  *	Retire the given physical map from service.
3244  *	Should only be called if the map contains
3245  *	no valid mappings.
3246  */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop a reference; only the final release tears the pmap down. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Make sure no CPU is currently running on, or switching to, this pmap. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unpublish the pmap so it can no longer be found on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps have no ASID of their own; flush by VA range instead. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	/* Verify per-task ledger balances net out to zero before freeing. */
	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3359 
/*
 * Kernel-side entry point for releasing a reference on a pmap; the pmap is
 * torn down when the last reference is dropped.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger now: the pmap may be freed by the call below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Drop the reference taken on behalf of the pmap in pmap_create_options(). */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3380 
3381 
3382 /*
3383  *	Add a reference to the specified pmap.
3384  */
3385 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3386 pmap_reference_internal(
3387 	pmap_t pmap)
3388 {
3389 	if (pmap != PMAP_NULL) {
3390 		validate_pmap_mutable(pmap);
3391 		os_atomic_inc(&pmap->ref_count, relaxed);
3392 	}
3393 }
3394 
/*
 * Kernel-side wrapper: take an additional reference on a pmap, crossing into
 * the PPL when one is present.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3405 
/*
 * Allocate a root (TT1) translation table of the given size for a pmap.
 * Sub-page tables are carved out of whole pages and cached on free lists;
 * whole allocations are served from size-specific free lists when possible.
 * Returns NULL only on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only PMAP_ROOT_ALLOC_SIZE sub-page requests are honored as-is. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* First try to satisfy the request from the cached free lists. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists are empty: allocate fresh zeroed pages. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * A sub-page request consumed a whole page; chain the unused
		 * remainder into the sub-page free list for future callers.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3482 
/*
 * Return a root (TT1) translation table to the appropriate free list, then
 * (unless PMAP_TT_DEALLOCATE_NOBLOCK) trim the page-sized free lists back
 * down to their caps, releasing surplus pages to the system.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size normalization done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller can't block: skip the trimming below, which may free pages. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/* Trim single-page free list; the lock is dropped around each page free. */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise trim the two-page free list. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3563 
/**
 * Allocate a non-root translation table for a pmap.
 *
 * First tries to pop an entry from the pmap's per-pmap free list of
 * sub-page table chunks.  If that list is empty, allocates a fresh zeroed
 * VM page and a page table descriptor (PTD) for it; when the pmap's page
 * table size is smaller than the kernel PAGE_SIZE, the remainder of the
 * page is carved into additional free-list entries for future allocations.
 *
 * @param pmap The pmap for which the table is being allocated.
 * @param ttp Output: KVA of the allocated table; NULL only on failure.
 * @param level Page table level being allocated; used solely to decide
 *              which statistics counters to bump (TTE vs PTE pages).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to fail with
 *                KERN_RESOURCE_SHORTAGE instead of blocking for memory.
 *
 * @return KERN_SUCCESS, KERN_RESOURCE_SHORTAGE (NOWAIT allocation failed),
 *         or KERN_ABORTED (pmap lock could not be acquired preemptibly).
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the pmap's free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavy lifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 *  NOTE(review): the allocation call keys off PMAP_TT_ALLOCATE_NOWAIT
		 *  while the retry check below keys off PMAP_OPTIONS_NOWAIT — these
		 *  appear intended to alias; confirm both flags share the same value.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Levels above the leaf count as TTE pages; the leaf level as PTE pages. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Install the PTD as the PV head for the page backing this table. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Push every table-sized chunk after the first onto the pmap's free list. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3662 
3663 
/**
 * Return a (possibly sub-page) translation table chunk to its pmap.
 *
 * The chunk is normally pushed onto the pmap's tt_entry_free list.  If,
 * after accounting, every chunk of the containing kernel page is unused
 * (all PTD refcounts zero and all sibling chunks already on the free list),
 * those siblings are unlinked from the free list and the entire page is
 * returned to the VM along with its page table descriptor.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table chunk being freed.
 * @param level Page table level of the freed table; used for the refcount
 *              sentinel handling and for statistics.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized table chunks that fit in one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Poison the VA slot so stale lookups are detectable. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the PT_DESC_REFCOUNT sentinel; reset it to free. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every chunk sharing this page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * Note: the list "head" is the address of pmap->tt_entry_free itself,
		 * cast as a node, so the walk below can also unlink the first element.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/* Count how many chunks of ttp's page are already on the free list. */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/* Whole page is free: unlink every sibling chunk, then free the page. */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other chunks of the page are still in use: just cache this one. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Tear down the PTD and the PV head entry, then hand the page back. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3752 
3753 /**
3754  * Safely clear out a translation table entry.
3755  *
3756  * @note If the TTE to clear out points to a leaf table, then that leaf table
3757  *       must have a refcnt of zero before the TTE can be removed.
3758  * @note This function expects to be called with pmap locked exclusive, and will
3759  *       return with pmap unlocked.
3760  *
3761  * @param pmap The pmap containing the page table whose TTE is being removed.
3762  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3763  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3764  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3765  * @param ttep Pointer to the TTE that should be cleared out.
3766  * @param level The level of the page table that contains the TTE to be removed.
3767  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the descriptor and force the write out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			/* Re-scan only the CPUs still flagged in the bitmap; clear each once it's quiescent. */
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Exclusive monitor is armed: WFE sleeps until that CPU writes the flag. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3942 
3943 /**
3944  * Given a pointer to an entry within a `level` page table, delete the
3945  * page table at `level` + 1 that is represented by that entry. For instance,
3946  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3947  * contains the PA of the L3 table, and `level` would be "2".
3948  *
3949  * @note If the table getting deallocated is a leaf table, then that leaf table
3950  *       must have a refcnt of zero before getting deallocated. All other levels
3951  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3952  * @note This function expects to be called with pmap locked exclusive and will
3953  *       return with pmap unlocked.
3954  *
3955  * @param pmap The pmap that owns the page table to be deallocated.
3956  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3957  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3958  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3959  * @param ttep Pointer to the `level` TTE to remove.
3960  * @param level The level of the table that contains an entry pointing to the
3961  *              table to be removed. The deallocated page table will be a
3962  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3963  *              deleted).
3964  */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Snapshot the TTE now: pmap_tte_remove() clears *ttep, and we still need
	 * the old value to locate the table page to free afterwards. */
	tte = *ttep;

	/* Sanity check: the table's descriptor must record this pmap as its owner. */
	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
	    __func__, ttep, (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* The entry is gone and the TLB is flushed; now free the `level`+1 table itself. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
3993 
3994 /*
3995  *	Remove a range of hardware page-table entries.
3996  *	The entries given are the first (inclusive)
3997  *	and last (exclusive) entries for the VM pages.
3998  *	The virtual address is the va for the first pte.
3999  *
4000  *	The pmap must be locked.
4001  *	If the pmap is not the kernel pmap, the range must lie
4002  *	entirely within one pte-page.  This is NOT checked.
4003  *	Assumes that the pte-page exists.
4004  *
4005  *	Returns the number of PTE changed
4006  */
4007 MARK_AS_PMAP_TEXT static int
4008 pmap_remove_range(
4009 	pmap_t pmap,
4010 	vm_map_address_t va,
4011 	pt_entry_t *bpte,
4012 	pt_entry_t *epte)
4013 {
4014 	bool need_strong_sync = false;
4015 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4016 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4017 	if (num_changed > 0) {
4018 		PMAP_UPDATE_TLBS(pmap, va,
4019 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4020 	}
4021 	return num_changed;
4022 }
4023 
4024 
4025 #ifdef PVH_FLAG_EXEC
4026 
4027 /*
4028  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
4030  *	has no writable mappings anywhere in the system, including the physical
4031  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4032  *	synchronization overhead in cases where the call to this function is
4033  *	guaranteed to be followed by other TLB operations.
4034  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	/* Caller must already hold the PV head lock for this physical page. */
	pvh_assert_locked(pai);
	/* Locate the kernel physical-aperture mapping of the page and its PTE. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the requested AP bits are already in place. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A physical-aperture PTE is never expected to carry the hint bit; panic
	 * loudly rather than modify a PTE that covers more than this page. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	/* Publish the new PTE, then flush; the sync may be deferred by the caller. */
	write_pte_strong(pte_p, tmplate);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4059 #endif /* defined(PVH_FLAG_EXEC) */
4060 
4061 
4062 
/**
 * Remove a range of hardware PTEs contained within a single page table.
 *
 * Clears each PTE (and any "compressed" markers when PMAP_OPTIONS_REMOVE is
 * set), detaches the mappings from their physical pages' PV lists, and
 * updates the pmap's refcounts, ledgers, and accounting counters.  The TLB
 * is NOT flushed here; callers are responsible for TLB maintenance.
 *
 * @param pmap The pmap the PTEs belong to; must be locked exclusive.
 * @param va VA mapped by the first PTE (bpte).
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE to remove; [bpte, epte) must not cross a
 *             page table boundary.
 * @param eva If non-NULL, enables early exit on pending preemption; the VA
 *            at which processing stopped is stored here.
 * @param need_strong_sync Output: set when a removed mapping requires a
 *                         strong DSB for TLB synchronization (HAS_FEAT_XS).
 * @param options PMAP_OPTIONS_REMOVE and friends.
 *
 * @return The number of PTEs actually changed (i.e., needing TLB flush).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we can take the PVH lock for a stable (pte, pa) pair, or
		 * until we determine the PTE doesn't reference a managed page.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock; retry if the PA changed beneath us. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* Kernel-pmap page tables don't maintain a refcount. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the per-category ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta to the page table's PTD in one shot. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4304 
4305 
4306 /*
4307  *	Remove the given range of addresses
4308  *	from the specified map.
4309  *
4310  *	It is assumed that the start and end are properly
4311  *	rounded to the hardware page size.
4312  */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* Thin wrapper: full removal (compressed markers included) is the default. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4321 
/**
 * Remove the mappings covering [start, end) from a pmap, where the range is
 * assumed to lie within a single twig-level (leaf page table) region.
 *
 * If the affected leaf page table becomes empty (user pmaps only), the table
 * itself is also deallocated via pmap_tte_deallocate(), which handles the
 * required TLB maintenance and drops the pmap lock.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start First VA to remove; must be leaf-table aligned.
 * @param end Non-inclusive end VA; must be leaf-table aligned and >= start.
 * @param options Removal options, forwarded to pmap_remove_range_options().
 *
 * @return The VA up to which removal actually proceeded.  Equal to `end`
 *         unless the operation was cut short by pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig-level entry for this VA: nothing is mapped here. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Compute the PTE span within the leaf table for [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* User leaf table drained to zero refcnt: reclaim the table itself. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4388 
/*
 * Remove all mappings in [start, end) from 'pmap', splitting the work into
 * per-twig (single leaf-table) chunks so that each internal call operates on
 * exactly one page table.  A NULL pmap is a no-op.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Advance to the next twig boundary, clamped to 'end'. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		va = pmap_remove_options_ppl(pmap, va, l, options);

		/* The PPL ledger may have been mutated; sanity-check it outside the PPL. */
		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4441 
4442 
4443 /*
4444  *	Remove phys addr if mapped in specified map
4445  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentional no-op on this architecture. */
	/* Implement to support working set code */
}
4453 
4454 /*
4455  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4456  * switch a thread onto a new vm_map.
4457  */
4458 void
4459 pmap_switch_user(thread_t thread, vm_map_t new_map)
4460 {
4461 	pmap_t new_pmap = new_map->pmap;
4462 
4463 
4464 	thread->map = new_map;
4465 	pmap_set_pmap(new_pmap, thread);
4466 
4467 }
4468 
/*
 * Activate 'pmap' as the current user address space, and when
 * __ARM_USER_PROTECT__ is enabled also record the user TTBR and ASID on the
 * thread's machine state for later context switches.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4483 
4484 static void
4485 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4486 {
4487 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4488 }
4489 
4490 static inline bool
4491 pmap_user_ttb_is_clear(void)
4492 {
4493 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4494 }
4495 
/*
 * Core of pmap_switch(): make 'pmap' the active address space on the calling
 * CPU, performing whatever local TLB maintenance is needed to avoid stale or
 * conflicting translations (software-ASID aliasing, shared-region changes,
 * and mixed-page-size commpage overlap).
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* Only the kernel pmap may legitimately carry ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if __ARM_MIXED_PAGE_SIZE__
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
	 * partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Park the user TTBR on the invalid table before any flush that requires break-before-make. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Wait for any of the async invalidations issued above to complete. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4630 
/*
 * Activate the given pmap's address space on the current CPU, wrapping the
 * operation in trace events and dispatching to the PPL when present.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4643 
4644 void
4645 pmap_page_protect(
4646 	ppnum_t ppnum,
4647 	vm_prot_t prot)
4648 {
4649 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4650 }
4651 
4652 /*
4653  *	Routine:	pmap_page_protect_options
4654  *
4655  *	Function:
4656  *		Lower the permission for all mappings to a given
4657  *		page.
4658  */
/**
 * Lower the protection on, or entirely remove, every CPU mapping of a managed
 * physical page.
 *
 * @param ppnum physical page whose mappings are to be modified
 * @param prot new protection: VM_PROT_ALL is a no-op; VM_PROT_READ and
 *	VM_PROT_READ | VM_PROT_EXECUTE downgrade mappings to read-only; any
 *	other value removes the mappings outright.
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_COMPRESSOR marks removed
 *	internal mappings as "compressed" for ledger purposes, and
 *	PMAP_OPTIONS_NOFLUSH may defer TLB invalidation to the caller.
 * @param flush_range if non-NULL, TLB invalidation for non-removed mappings
 *	falling entirely within the described pmap/VA range is delegated to
 *	the caller via flush_range->ptfr_flush_needed.
 *
 * The work is split into two passes over the page's PV list: pass 1 rewrites
 * PTEs and performs ledger accounting; pass 2 issues TLB invalidations and,
 * on removal, unlinks PV entries while preserving any IOMMU entries.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	pv_entry_t     *pveh_p;
	pv_entry_t     *pvet_p;
	pt_entry_t     *pte_p, *orig_pte_p;
	pv_entry_t     *new_pve_p;
	pt_entry_t     *new_pte_p;
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	unsigned int    pvh_cnt = 0;
	/* Sanity counters: pass 1 and pass 2 must agree on how many PTEs changed. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are never modified here; they are preserved in pass 2. */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
				    __func__, pmap, ppnum);
			}

			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize the PTE writes from pass 1 before issuing any TLBIs. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
				pmap_flush_noncoherent_page(phys);
			}
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/* For non-removal within the caller's range, defer the flush to the caller. */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}


	pvh_unlock(pai);

	/* Refcounts are now consistent; clear the disconnect flag and re-enable preemption. */
	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5176 
5177 MARK_AS_PMAP_TEXT void
5178 pmap_page_protect_options_internal(
5179 	ppnum_t ppnum,
5180 	vm_prot_t prot,
5181 	unsigned int options,
5182 	void *arg)
5183 {
5184 	if (arg != NULL) {
5185 		/*
5186 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5187 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5188 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5189 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5190 		 * In that case, force the flush to take place.
5191 		 */
5192 		options &= ~PMAP_OPTIONS_NOFLUSH;
5193 	}
5194 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5195 }
5196 
5197 void
5198 pmap_page_protect_options(
5199 	ppnum_t ppnum,
5200 	vm_prot_t prot,
5201 	unsigned int options,
5202 	void *arg)
5203 {
5204 	pmap_paddr_t    phys = ptoa(ppnum);
5205 
5206 	assert(ppnum != vm_page_fictitious_addr);
5207 
5208 	/* Only work with managed pages. */
5209 	if (!pa_valid(phys)) {
5210 		return;
5211 	}
5212 
5213 	/*
5214 	 * Determine the new protection.
5215 	 */
5216 	if (prot == VM_PROT_ALL) {
5217 		return;         /* nothing to do */
5218 	}
5219 
5220 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5221 
5222 #if XNU_MONITOR
5223 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5224 #else
5225 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5226 #endif
5227 
5228 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5229 }
5230 
5231 
5232 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Mark the given user pmap so that JOP protection is disabled for it
 * (sets pmap->disable_jop).  Must never be called on the kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate the caller-supplied pmap before mutating it. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5242 
void
pmap_disable_user_jop(pmap_t pmap)
{
	/* Dispatch into the PPL on monitor-enabled systems; call directly otherwise. */
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5252 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5253 
5254 /*
5255  * Indicates if the pmap layer enforces some additional restrictions on the
5256  * given set of protections.
5257  */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/* The ARM pmap imposes no additional protection policy; all inputs are ignored. */
	return false;
}
5263 
5264 /*
5265  *	Set the physical protection on the
5266  *	specified range of this map as requested.
5267  *	VERY IMPORTANT: Will not increase permissions.
5268  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5269  */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Thin wrapper: protect [b, e) with no options and no extra argument. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5279 
/*
 * Internal worker for pmap_protect_options().
 *
 * Downgrades the protection on all valid mappings in [start, end) of 'pmap'.
 * The range must lie within a single twig-level region; callers split larger
 * requests.  Returns the VA at which the walk stopped, which may be earlier
 * than 'end' if the loop bailed out to service pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must not cross a twig boundary (a single TTE must cover it). */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Requests that strip all access must go through pmap_remove instead. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/*
			 * Periodically bail out so pending preemption can be serviced;
			 * the caller resumes from the returned VA.
			 */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Skip empty entries and compressed (swapped-out marker) entries. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock for the physical page,
			 * re-reading the PTE after acquisition to make sure it still
			 * refers to the same page.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have become invalid/compressed while we
			 * were chasing the PVH lock above. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the downgraded PTE: read-only AP bits, unless a
			 * DEVELOPMENT/DEBUG immediate request explicitly forces RW. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish the PTE updates, then invalidate the TLB for the VAs we
		 * actually modified ([start, va)). */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table maps this range, so there is nothing to protect. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5527 
/*
 * Downgrade the protection on all mappings in [b, e) of 'pmap' to 'prot',
 * splitting the request into twig-aligned chunks processed by
 * pmap_protect_options_internal() (via the PPL on monitor-enabled systems).
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both range endpoints must be aligned to this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			/* Immediately revoking all access is simply a removal. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Any other combination (e.g. VM_PROT_NONE) is a removal. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Process one twig-aligned chunk at a time.  The worker may stop early
	 * (to allow preemption) and returns the VA at which to resume.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5602 
5603 /**
5604  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5605  *
5606  * @param pmap pmap to insert the pages into.
5607  * @param va virtual address to map the pages into.
5608  * @param pa page number of the first physical page to map.
5609  * @param size block size, in number of pages.
5610  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags option flags; in this path they are only reported in panic diagnostics.
5612  *
5613  * @return KERN_SUCCESS.
5614  */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	unsigned int flags)
{
	/* Convert the page number to a physical address and defer to the
	 * address-based variant, which handles the per-pmap page size. */
	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
}
5627 
5628 /**
5629  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5630  * As opposed to pmap_map_block(), this function takes
5631  * a physical address as an input and operates using the
5632  * page size associated with the input pmap.
5633  *
5634  * @param pmap pmap to insert the pages into.
5635  * @param va virtual address to map the pages into.
5636  * @param pa physical address of the first physical page to map.
5637  * @param size block size, in number of pages.
5638  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags option flags; currently only reported in panic diagnostics.
5640  *
5641  * @return KERN_SUCCESS.
5642  */
5643 kern_return_t
5644 pmap_map_block_addr(
5645 	pmap_t pmap,
5646 	addr64_t va,
5647 	pmap_paddr_t pa,
5648 	uint32_t size,
5649 	vm_prot_t prot,
5650 	int attr,
5651 	unsigned int flags)
5652 {
5653 #if __ARM_MIXED_PAGE_SIZE__
5654 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5655 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5656 #else
5657 	const uint64_t pmap_page_size = PAGE_SIZE;
5658 #endif
5659 
5660 	for (ppnum_t page = 0; page < size; page++) {
5661 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5662 			panic("%s: failed pmap_enter_addr, "
5663 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5664 			    __FUNCTION__,
5665 			    pmap, va, (uint64_t)pa, size, prot, flags);
5666 		}
5667 
5668 		va += pmap_page_size;
5669 		pa += pmap_page_size;
5670 	}
5671 
5672 	return KERN_SUCCESS;
5673 }
5674 
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	/* Enter the mapping with no options, no callback argument, and an
	 * inferred mapping type. */
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5687 
5688 /*
5689  *	Insert the given physical page (p) at
5690  *	the specified virtual address (v) in the
5691  *	target physical map with the protection requested.
5692  *
5693  *	If specified, the page will be wired down, meaning
5694  *	that the related pte can not be reclaimed.
5695  *
5696  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5697  *	or lose information.  That is, this routine must actually
 *	insert this page into the given map eventually (must make
 *	forward progress eventually).
5700  */
kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_address_t v,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	__unused pmap_mapping_type_t mapping_type)
{
	/* Convert the page number to a physical address and defer to pmap_enter_addr(). */
	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
}
5714 
5715 /*
5716  * Attempt to commit the pte.
5717  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5718  * Performs no page table or accounting writes on failures.
5719  */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/* Single-PTE case: commit atomically; *old_pte receives the value observed. */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	if (success && *old_pte != new_pte) {
		/* If we replaced a valid mapping, the old translation may still be
		 * cached in the TLBs and must be invalidated. */
		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			/* No prior valid mapping: just make the PTE write visible. */
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/* A compressed marker carries no wired state, so for that case any
		 * wired new mapping counts as a wiring change. */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		/* Keep the page-table descriptor's wired count and the task's
		 * wired-memory ledger in sync with the wiring change. */
		if (pmap != kernel_pmap && changed_wiring) {
			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5785 
/*
 * Translate a VM_WIMG_* cacheability/ordering attribute into the
 * corresponding PTE bits: the memory-attribute index (ATTRINDX),
 * shareability, and (for device/posted memory) NX/PNX execute-never bits.
 *
 * 'pa' is consulted only to distinguish DRAM from non-DRAM addresses when
 * choosing among the device-memory attribute indices; it does not itself
 * appear in the returned PTE.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		/* DRAM gets the combined/reordered posted index, like VM_WIMG_IO above. */
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		/* Non-DRAM uses the XS variant of the index on FEAT_XS hardware. */
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized attribute: fall back to the default cacheable index. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5857 
5858 
5859 /*
5860  * Construct a PTE (and the physical page attributes) for the given virtual to
5861  * physical mapping.
5862  *
5863  * This function has no side effects and is safe to call so that it is safe to
5864  * call while attempting a pmap_enter transaction.
5865  */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	uint16_t *pp_attr_bits /* OUTPUT */
	)
{
	bool set_NX = false, set_XO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/* Execute permission is granted only when requested (and, on
	 * DEVELOPMENT/DEBUG kernels, also when NX enforcement has been
	 * globally or per-pmap disabled). */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only requests get a special AP encoding below. */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable mappings still carry user-NX. */
			pte |= ARM_PTE_NX;
		} else {
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		/* Non-kernel mappings are non-global (nG), except in a nested pmap
		 * where only addresses in unnested sub-regions get nG. */
		if (pmap->type != PMAP_TYPE_NESTED) {
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_unnested_table_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));

			if ((pmap->nested_region_unnested_table_bitmap)
			    && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					/* Actual write fault: map RW and record the modification now. */
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (set_XO) {
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Set the hardware Access Flag: new mappings are created as accessed. */
	pte |= ARM_PTE_AF;
	return pte;
}
5968 
5969 MARK_AS_PMAP_TEXT kern_return_t
5970 pmap_enter_options_internal(
5971 	pmap_t pmap,
5972 	vm_map_address_t v,
5973 	pmap_paddr_t pa,
5974 	vm_prot_t prot,
5975 	vm_prot_t fault_type,
5976 	unsigned int flags,
5977 	boolean_t wired,
5978 	unsigned int options)
5979 {
5980 	ppnum_t         pn = (ppnum_t)atop(pa);
5981 	pt_entry_t      pte;
5982 	pt_entry_t      spte;
5983 	pt_entry_t      *pte_p;
5984 	bool            refcnt_updated;
5985 	bool            wiredcnt_updated;
5986 	bool            ro_va = false;
5987 	unsigned int    wimg_bits;
5988 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
5989 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
5990 	kern_return_t   kr = KERN_SUCCESS;
5991 	uint16_t pp_attr_bits;
5992 	volatile uint16_t *refcnt;
5993 	volatile uint16_t *wiredcnt;
5994 	pv_free_list_t *local_pv_free;
5995 
5996 	validate_pmap_mutable(pmap);
5997 
5998 #if XNU_MONITOR
5999 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6000 		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6001 	}
6002 #endif
6003 
6004 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6005 
6006 	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6007 		panic("%s: pmap %p v 0x%llx not page-aligned",
6008 		    __func__, pmap, (unsigned long long)v);
6009 	}
6010 
6011 	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6012 		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6013 	}
6014 
6015 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6016 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6017 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6018 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6019 	}
6020 
6021 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6022 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6023 		    pmap, (uint64_t)pa);
6024 	}
6025 
6026 	/* The PA should not extend beyond the architected physical address space */
6027 	pa &= ARM_PTE_PAGE_MASK;
6028 
6029 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6030 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6031 		extern vm_offset_t ctrr_test_page;
6032 		if (__probable(v != ctrr_test_page))
6033 #endif
6034 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6035 	}
6036 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6037 		if (__improbable(prot != VM_PROT_READ)) {
6038 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6039 			    __func__, (unsigned long long)v, prot);
6040 		}
6041 		ro_va = true;
6042 	}
6043 	assert(pn != vm_page_fictitious_addr);
6044 
6045 	refcnt_updated = false;
6046 	wiredcnt_updated = false;
6047 
6048 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6049 		/*
6050 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6051 		 *
6052 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6053 		 */
6054 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6055 	}
6056 
6057 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6058 		return KERN_ABORTED;
6059 	}
6060 
6061 	/*
6062 	 *	Expand pmap to include this pte.  Assume that
6063 	 *	pmap is always expanded to include enough hardware
6064 	 *	pages to map one VM page.
6065 	 */
6066 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6067 		/* Must unlock to expand the pmap. */
6068 		pmap_unlock(pmap, lock_mode);
6069 
6070 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6071 
6072 		if (kr != KERN_SUCCESS) {
6073 			return kr;
6074 		}
6075 
6076 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6077 			return KERN_ABORTED;
6078 		}
6079 	}
6080 
6081 	if (options & PMAP_OPTIONS_NOENTER) {
6082 		pmap_unlock(pmap, lock_mode);
6083 		return KERN_SUCCESS;
6084 	}
6085 
6086 	/*
6087 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6088 	 * done via a cmpxchg loop.
6089 	 * We need to be careful about modifying non-local data structures before commiting
6090 	 * the new pte since we may need to re-do the transaction.
6091 	 */
6092 	spte = os_atomic_load(pte_p, relaxed);
6093 	while (!committed) {
6094 		refcnt = NULL;
6095 		wiredcnt = NULL;
6096 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6097 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6098 
6099 		if (pmap != kernel_pmap) {
6100 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6101 			refcnt = &ptd_info->refcnt;
6102 			wiredcnt = &ptd_info->wiredcnt;
6103 			/*
6104 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6105 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6106 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6107 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6108 			 * have PTDs, so we can't use the check there.
6109 			 */
6110 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6111 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6112 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6113 			}
6114 			/*
6115 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6116 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6117 			 * or acquire the pmap lock exclusive.
6118 			 */
6119 			if (!wiredcnt_updated) {
6120 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6121 				wiredcnt_updated = true;
6122 			}
6123 			if (!refcnt_updated) {
6124 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6125 				refcnt_updated = true;
6126 				drop_refcnt = true;
6127 			}
6128 		}
6129 
6130 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6131 			/*
6132 			 * There is already a mapping here & it's for a different physical page.
6133 			 * First remove that mapping.
6134 			 *
6135 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6136 			 */
6137 			if (lock_mode == PMAP_LOCK_SHARED) {
6138 				if (pmap_lock_shared_to_exclusive(pmap)) {
6139 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6140 				} else {
6141 					/*
6142 					 * We failed to upgrade to an exclusive lock.
6143 					 * As a result we no longer hold the lock at all,
6144 					 * so we need to re-acquire it and restart the transaction.
6145 					 */
6146 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6147 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6148 					/* pmap might have changed after we dropped the lock. Try again. */
6149 					spte = os_atomic_load(pte_p, relaxed);
6150 					continue;
6151 				}
6152 			}
6153 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6154 			spte = ARM_PTE_TYPE_FAULT;
6155 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6156 		}
6157 
6158 		/*
6159 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6160 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6161 		 * read-write protection. The PMAP layer though still needs to use the right
6162 		 * index, which is the older XO-now-TPRO one and that is specially selected
6163 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6164 		 */
6165 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6166 			if (__improbable(pmap == kernel_pmap)) {
6167 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6168 				    __func__);
6169 			}
6170 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6171 		} else {
6172 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6173 		}
6174 
6175 		if (pa_valid(pa)) {
6176 			unsigned int pai;
6177 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6178 
6179 			is_internal = FALSE;
6180 			is_altacct = FALSE;
6181 
6182 			pai = pa_index(pa);
6183 
6184 			pvh_lock(pai);
6185 
6186 			/*
6187 			 * Make sure that the current per-cpu PV free list has
6188 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6189 			 * if the transaction succeeds. We're either in the
6190 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6191 			 * Note that we can still be interrupted, but a primary
6192 			 * interrupt handler can never enter the pmap.
6193 			 */
6194 #if !XNU_MONITOR
6195 			assert(get_preemption_level() > 0);
6196 #endif
6197 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6198 			pv_entry_t **pv_h = pai_to_pvh(pai);
6199 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6200 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6201 
6202 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6203 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6204 				int new_allocated_pves = 0;
6205 
6206 				while (new_allocated_pves < 2) {
6207 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6208 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6209 					if (pv_status == PV_ALLOC_FAIL) {
6210 						break;
6211 					} else if (pv_status == PV_ALLOC_RETRY) {
6212 						/*
6213 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6214 						 * it will have dropped the pmap lock while doing so.
6215 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6216 						 * be on a different CPU now.
6217 						 */
6218 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6219 					} else {
6220 						/* If we've gotten this far then a node should've been allocated. */
6221 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6222 
6223 						new_allocated_pves++;
6224 					}
6225 				}
6226 
6227 				for (int i = 0; i < new_allocated_pves; i++) {
6228 					pv_free(new_pve_p[i]);
6229 				}
6230 			}
6231 
6232 			if (pv_status == PV_ALLOC_FAIL) {
6233 				pvh_unlock(pai);
6234 				kr = KERN_RESOURCE_SHORTAGE;
6235 				break;
6236 			} else if (pv_status == PV_ALLOC_RETRY) {
6237 				pvh_unlock(pai);
6238 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6239 				spte = os_atomic_load(pte_p, relaxed);
6240 				continue;
6241 			}
6242 
6243 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6244 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6245 			} else {
6246 				wimg_bits = pmap_cache_attributes(pn);
6247 			}
6248 
6249 			/* We may be retrying this operation after dropping the PVH lock.
6250 			 * Cache attributes for the physical page may have changed while the lock
6251 			 * was dropped, so clear any cache attributes we may have previously set
6252 			 * in the PTE template. */
6253 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6254 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6255 
6256 #if XNU_MONITOR
6257 			/* The regular old kernel is not allowed to remap PPL pages. */
6258 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6259 				panic("%s: page belongs to PPL, "
6260 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6261 				    __FUNCTION__,
6262 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6263 			}
6264 
6265 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6266 				panic("%s: page locked down, "
6267 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6268 				    __FUNCTION__,
6269 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6270 			}
6271 #endif
6272 
6273 
6274 
6275 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6276 			if (!committed) {
6277 				pvh_unlock(pai);
6278 				continue;
6279 			}
6280 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6281 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6282 
6283 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6284 			/*
6285 			 * If there was already a valid pte here then we reuse its reference
6286 			 * on the ptd and drop the one that we took above.
6287 			 */
6288 			drop_refcnt = had_valid_mapping;
6289 
6290 			if (!had_valid_mapping) {
6291 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6292 				int pve_ptep_idx = 0;
6293 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6294 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6295 				if (pv_status != PV_ALLOC_SUCCESS) {
6296 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6297 					    __func__, pv_status, new_pve_p, pmap);
6298 				}
6299 
6300 				if (pmap != kernel_pmap) {
6301 					if (options & PMAP_OPTIONS_INTERNAL) {
6302 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6303 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6304 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6305 							/*
6306 							 * Make a note to ourselves that this
6307 							 * mapping is using alternative
6308 							 * accounting. We'll need this in order
6309 							 * to know which ledger to debit when
6310 							 * the mapping is removed.
6311 							 *
6312 							 * The altacct bit must be set while
6313 							 * the pv head is locked. Defer the
6314 							 * ledger accounting until after we've
6315 							 * dropped the lock.
6316 							 */
6317 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6318 							is_altacct = TRUE;
6319 						}
6320 					}
6321 					if (ppattr_test_reusable(pai) &&
6322 					    !is_altacct) {
6323 						is_reusable = TRUE;
6324 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6325 						is_internal = TRUE;
6326 					} else {
6327 						is_external = TRUE;
6328 					}
6329 				}
6330 			}
6331 
6332 			pvh_unlock(pai);
6333 
6334 			if (pp_attr_bits != 0) {
6335 				ppattr_pa_set_bits(pa, pp_attr_bits);
6336 			}
6337 
6338 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6339 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6340 
6341 				if (is_internal) {
6342 					/*
6343 					 * Make corresponding adjustments to
6344 					 * phys_footprint statistics.
6345 					 */
6346 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6347 					if (is_altacct) {
6348 						/*
6349 						 * If this page is internal and
6350 						 * in an IOKit region, credit
6351 						 * the task's total count of
6352 						 * dirty, internal IOKit pages.
6353 						 * It should *not* count towards
6354 						 * the task's total physical
6355 						 * memory footprint, because
6356 						 * this entire region was
6357 						 * already billed to the task
6358 						 * at the time the mapping was
6359 						 * created.
6360 						 *
6361 						 * Put another way, this is
6362 						 * internal++ and
6363 						 * alternate_accounting++, so
6364 						 * net effect on phys_footprint
6365 						 * is 0. That means: don't
6366 						 * touch phys_footprint here.
6367 						 */
6368 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6369 					} else {
6370 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6371 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6372 							skip_footprint_debit = true;
6373 						} else {
6374 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6375 						}
6376 					}
6377 				}
6378 				if (is_reusable) {
6379 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6380 				} else if (is_external) {
6381 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6382 				}
6383 			}
6384 		} else {
6385 			if (prot & VM_PROT_EXECUTE) {
6386 				kr = KERN_FAILURE;
6387 				break;
6388 			}
6389 
6390 			wimg_bits = pmap_cache_attributes(pn);
6391 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6392 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6393 			}
6394 
6395 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6396 
6397 #if XNU_MONITOR
6398 			pte = pmap_construct_io_pte(pa, pte);
6399 
6400 			/**
6401 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6402 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6403 			 * created and later removed.  We must therefore prevent an attacker from downgrading a
6404 			 * a writable mapping in order to allow it to be removed and remapped to something else.
6405 			 */
6406 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6407 			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6408 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6409 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6410 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6411 				    __func__, (uint64_t)pte_to_pa(spte));
6412 			}
6413 #endif
6414 
6415 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6416 			if (committed) {
6417 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6418 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6419 
6420 				/**
6421 				 * If there was already a valid pte here then we reuse its
6422 				 * reference on the ptd and drop the one that we took above.
6423 				 */
6424 				drop_refcnt = had_valid_mapping;
6425 			}
6426 		}
6427 		if (committed) {
6428 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6429 				assert(pmap != kernel_pmap);
6430 
6431 				/* One less "compressed" */
6432 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6433 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6434 
6435 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6436 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6437 				} else if (!skip_footprint_debit) {
6438 					/* Was part of the footprint */
6439 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6440 				}
6441 				/* The old entry held a reference so drop the extra one that we took above. */
6442 				drop_refcnt = true;
6443 			}
6444 		}
6445 	}
6446 
6447 	if (drop_refcnt && refcnt != NULL) {
6448 		assert(refcnt_updated);
6449 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6450 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6451 		}
6452 	}
6453 
6454 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6455 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6456 	}
6457 
6458 	pmap_unlock(pmap, lock_mode);
6459 
6460 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6461 		pmap_phys_write_disable(v);
6462 	}
6463 
6464 	return kr;
6465 }
6466 
/*
 * Enter (or update) a mapping from virtual address 'v' to physical address
 * 'pa' in 'pmap', retrying the underlying enter helper for as long as it
 * reports a transient failure (resource shortage or an aborted lock
 * acquisition).
 *
 * @param pmap          target pmap
 * @param v             virtual address of the mapping
 * @param pa            physical address to map
 * @param prot          protection to apply to the mapping
 * @param fault_type    type of access that triggered this enter
 * @param flags         WIMG cacheability flags
 * @param wired         whether the mapping should be wired
 * @param options       PMAP_OPTIONS_* flags; PMAP_OPTIONS_NOWAIT makes a
 *                      shortage non-retriable
 *
 * @return KERN_SUCCESS, or the final non-transient status of the attempt
 *         (KERN_RESOURCE_SHORTAGE is only returned when NOWAIT was requested).
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	/*
	 * On PPL-enabled systems the call into the PPL is always made with
	 * PMAP_OPTIONS_NOWAIT forced; when it reports a shortage we donate a
	 * page to the PPL from outside and retry. KERN_ABORTED (lock
	 * acquisition aborted due to pending preemption) is always retried.
	 */
	do {
#if XNU_MONITOR
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller asked not to wait; surface the shortage instead of retrying. */
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6513 
6514 kern_return_t
6515 pmap_enter_options(
6516 	pmap_t pmap,
6517 	vm_map_address_t v,
6518 	ppnum_t pn,
6519 	vm_prot_t prot,
6520 	vm_prot_t fault_type,
6521 	unsigned int flags,
6522 	boolean_t wired,
6523 	unsigned int options,
6524 	__unused void   *arg,
6525 	pmap_mapping_type_t mapping_type)
6526 {
6527 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6528 }
6529 
6530 /*
6531  *	Routine:	pmap_change_wiring
6532  *	Function:	Change the wiring attribute for a map/virtual-address
6533  *			pair.
6534  *	In/out conditions:
6535  *			The mapping must already exist in the pmap.
6536  */
6537 MARK_AS_PMAP_TEXT kern_return_t
6538 pmap_change_wiring_internal(
6539 	pmap_t pmap,
6540 	vm_map_address_t v,
6541 	boolean_t wired)
6542 {
6543 	pt_entry_t     *pte_p;
6544 	pmap_paddr_t    pa;
6545 
6546 	validate_pmap_mutable(pmap);
6547 
6548 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6549 		return KERN_ABORTED;
6550 	}
6551 
6552 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6553 
6554 	pte_p = pmap_pte(pmap, v);
6555 	if (pte_p == PT_ENTRY_NULL) {
6556 		if (!wired) {
6557 			/*
6558 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6559 			 * may have been freed by a remove operation.
6560 			 */
6561 			goto pmap_change_wiring_return;
6562 		} else {
6563 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6564 		}
6565 	}
6566 	/*
6567 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6568 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6569 	 */
6570 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6571 
6572 	while (pa_valid(pa)) {
6573 		pmap_paddr_t new_pa;
6574 
6575 		pvh_lock(pa_index(pa));
6576 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6577 
6578 		if (pa == new_pa) {
6579 			break;
6580 		}
6581 
6582 		pvh_unlock(pa_index(pa));
6583 		pa = new_pa;
6584 	}
6585 
6586 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6587 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6588 		if (!wired) {
6589 			/* PTE cleared by prior remove/disconnect operation */
6590 			goto pmap_change_wiring_cleanup;
6591 		} else {
6592 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6593 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6594 		}
6595 	}
6596 
6597 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6598 	if (wired != pte_is_wired(*pte_p)) {
6599 		pte_set_wired(pmap, pte_p, wired);
6600 		if (pmap != kernel_pmap) {
6601 			if (wired) {
6602 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6603 			} else if (!wired) {
6604 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6605 			}
6606 		}
6607 	}
6608 
6609 pmap_change_wiring_cleanup:
6610 	if (pa_valid(pa)) {
6611 		pvh_unlock(pa_index(pa));
6612 	}
6613 
6614 pmap_change_wiring_return:
6615 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6616 
6617 	return KERN_SUCCESS;
6618 }
6619 
/*
 * Change the wiring attribute of the mapping at 'v' in 'pmap'.
 * Retries the PPL call when it aborts due to pending preemption;
 * panics on any other failure.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	/* Any remaining failure indicates a kernel bug (e.g. wiring a nonexistent PTE). */
	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6647 
6648 MARK_AS_PMAP_TEXT pmap_paddr_t
6649 pmap_find_pa_internal(
6650 	pmap_t pmap,
6651 	addr64_t va)
6652 {
6653 	pmap_paddr_t    pa = 0;
6654 
6655 	validate_pmap(pmap);
6656 
6657 	if (pmap != kernel_pmap) {
6658 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6659 	}
6660 
6661 	pa = pmap_vtophys(pmap, va);
6662 
6663 	if (pmap != kernel_pmap) {
6664 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6665 	}
6666 
6667 	return pa;
6668 }
6669 
6670 pmap_paddr_t
6671 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6672 {
6673 	pmap_paddr_t pa = 0;
6674 
6675 	if (pmap == kernel_pmap) {
6676 		pa = mmu_kvtop(va);
6677 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6678 		/*
6679 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6680 		 * translation even if PAN would prevent kernel access through the translation.
6681 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6682 		 */
6683 		pa = mmu_uvtop(va);
6684 	}
6685 	return pa;
6686 }
6687 
6688 pmap_paddr_t
6689 pmap_find_pa(
6690 	pmap_t pmap,
6691 	addr64_t va)
6692 {
6693 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6694 
6695 	if (pa != 0) {
6696 		return pa;
6697 	}
6698 
6699 	if (not_in_kdp) {
6700 #if XNU_MONITOR
6701 		return pmap_find_pa_ppl(pmap, va);
6702 #else
6703 		return pmap_find_pa_internal(pmap, va);
6704 #endif
6705 	} else {
6706 		return pmap_vtophys(pmap, va);
6707 	}
6708 }
6709 
6710 ppnum_t
6711 pmap_find_phys_nofault(
6712 	pmap_t pmap,
6713 	addr64_t va)
6714 {
6715 	ppnum_t ppn;
6716 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6717 	return ppn;
6718 }
6719 
6720 ppnum_t
6721 pmap_find_phys(
6722 	pmap_t pmap,
6723 	addr64_t va)
6724 {
6725 	ppnum_t ppn;
6726 	ppn = atop(pmap_find_pa(pmap, va));
6727 	return ppn;
6728 }
6729 
6730 /**
6731  * Translate a kernel virtual address into a physical address.
6732  *
6733  * @param va The kernel virtual address to translate. Does not work on user
6734  *           virtual addresses.
6735  *
6736  * @return The physical address if the translation was successful, or zero if
6737  *         no valid mappings were found for the given virtual address.
6738  */
6739 pmap_paddr_t
6740 kvtophys(vm_offset_t va)
6741 {
6742 	/**
6743 	 * Attempt to do the translation first in hardware using the AT (address
6744 	 * translation) instruction. This will attempt to use the MMU to do the
6745 	 * translation for us.
6746 	 */
6747 	pmap_paddr_t pa = mmu_kvtop(va);
6748 
6749 	if (pa) {
6750 		return pa;
6751 	}
6752 
6753 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6754 	return pmap_vtophys(kernel_pmap, va);
6755 }
6756 
6757 /**
6758  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6759  * points to a non-kernel-managed physical page, then this call will panic().
6760  *
6761  * @note The output of this function is guaranteed to be a kernel-managed
6762  *       physical page, which means it's safe to pass the output directly to
6763  *       pa_index() to create a physical address index for various pmap data
6764  *       structures.
6765  *
6766  * @param va The kernel virtual address to translate. Does not work on user
6767  *           virtual addresses.
6768  *
6769  * @return The translated physical address for the given virtual address.
6770  */
6771 pmap_paddr_t
6772 kvtophys_nofail(vm_offset_t va)
6773 {
6774 	pmap_paddr_t pa = kvtophys(va);
6775 
6776 	if (!pa_valid(pa)) {
6777 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6778 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6779 	}
6780 
6781 	return pa;
6782 }
6783 
/*
 * Translate 'va' in 'pmap' to a physical address by walking the translation
 * tables in software. Returns 0 if any level of the walk hits an invalid
 * descriptor or the address is outside the pmap's range. Caller is
 * responsible for any locking needed to keep the tables stable.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Addresses outside the pmap's translation range cannot be mapped. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend from the root table toward the leaf level, one descriptor per level. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* An invalid descriptor at any level terminates the walk: no translation. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the descriptor's output address with this level's VA offset bits. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table descriptor: continue the walk in the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6828 
6829 /*
6830  *	pmap_init_pte_page - Initialize a page table page.
6831  */
6832 MARK_AS_PMAP_TEXT void
6833 pmap_init_pte_page(
6834 	pmap_t pmap,
6835 	pt_entry_t *pte_p,
6836 	vm_offset_t va,
6837 	unsigned int ttlevel,
6838 	boolean_t alloc_ptd)
6839 {
6840 	pt_desc_t   *ptdp = NULL;
6841 	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6842 
6843 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6844 		if (alloc_ptd) {
6845 			/*
6846 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6847 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6848 			 * bootstrap request, so we check for an existing PTD here.
6849 			 */
6850 			ptdp = ptd_alloc(pmap);
6851 			if (ptdp == NULL) {
6852 				panic("%s: unable to allocate PTD", __func__);
6853 			}
6854 			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6855 			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6856 			pvh_set_flags(pvh, 0);
6857 		} else {
6858 			panic("pmap_init_pte_page(): pte_p %p", pte_p);
6859 		}
6860 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6861 		ptdp = pvh_ptd(pvh);
6862 	} else {
6863 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6864 	}
6865 
6866 	// below barrier ensures previous updates to the page are visible to PTW before
6867 	// it is linked to the PTE of previous level
6868 	__builtin_arm_dmb(DMB_ISHST);
6869 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6870 }
6871 
6872 /*
6873  *	Routine:	pmap_expand
6874  *
6875  *	Expands a pmap to be able to map the specified virtual address.
6876  *
6877  *	Allocates new memory for the default (COARSE) translation table
6878  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6879  *	also allocates space for the corresponding pv entries.
6880  *
6881  *	Nothing should be locked.
6882  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Walk down from the root, allocating a next-level table wherever one is missing. */
	for (; ttlevel < level; ttlevel++) {
		/* Lock acquisition may abort when preemption is pending; propagate to the caller. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock while allocating; another thread may install the table meanwhile. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Don't leak the freshly allocated table on abort. */
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock: we may have lost the allocation race. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Publish the new table by linking it into the parent's TTE. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If our table wasn't consumed above (lost the race), free it. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6954 
6955 /*
6956  *	Routine:	pmap_gc
6957  *	Function:
6958  *              Pmap garbage collection
6959  *		Called by the pageout daemon when pages are scarce.
6960  *
6961  */
6962 void
6963 pmap_gc(void)
6964 {
6965 	/*
6966 	 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
6967 	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6968 	 * or may contain wired mappings.  However, with the relatively recent change to
6969 	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
6970 	 * page, it may make sense to call that function here.
6971 	 */
6972 }
6973 
6974 /*
6975  *      By default, don't attempt pmap GC more frequently
6976  *      than once / 1 minutes.
6977  */
6978 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally a no-op: pmap GC throttling is not implemented on this platform. */
}
6984 
6985 /*
6986  * pmap_attribute_cache_sync(vm_offset_t pa)
6987  *
6988  * Invalidates all of the instruction cache on a physical page and
6989  * pushes any dirty data from the data cache for the same physical page
6990  */
6991 
6992 kern_return_t
6993 pmap_attribute_cache_sync(
6994 	ppnum_t pp,
6995 	vm_size_t size,
6996 	__unused vm_machine_attribute_t attribute,
6997 	__unused vm_machine_attribute_val_t * value)
6998 {
6999 	if (size > PAGE_SIZE) {
7000 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7001 	} else {
7002 		cache_sync_page(pp);
7003 	}
7004 
7005 	return KERN_SUCCESS;
7006 }
7007 
7008 /*
7009  * pmap_sync_page_data_phys(ppnum_t pp)
7010  *
7011  * Invalidates all of the instruction cache on a physical page and
7012  * pushes any dirty data from the data cache for the same physical page
7013  */
7014 void
7015 pmap_sync_page_data_phys(
7016 	ppnum_t pp)
7017 {
7018 	cache_sync_page(pp);
7019 }
7020 
7021 /*
7022  * pmap_sync_page_attributes_phys(ppnum_t pp)
7023  *
7024  * Write back and invalidate all cachelines on a physical page.
7025  */
7026 void
7027 pmap_sync_page_attributes_phys(
7028 	ppnum_t pp)
7029 {
7030 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7031 }
7032 
7033 #if CONFIG_COREDUMP
7034 /* temporary workaround */
7035 boolean_t
7036 coredumpok(
7037 	vm_map_t map,
7038 	mach_vm_offset_t va)
7039 {
7040 	pt_entry_t     *pte_p;
7041 	pt_entry_t      spte;
7042 
7043 	pte_p = pmap_pte(map->pmap, va);
7044 	if (0 == pte_p) {
7045 		return FALSE;
7046 	}
7047 	if (vm_map_entry_has_device_pager(map, va)) {
7048 		return FALSE;
7049 	}
7050 	spte = *pte_p;
7051 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7052 }
7053 #endif
7054 
7055 void
7056 fillPage(
7057 	ppnum_t pn,
7058 	unsigned int fill)
7059 {
7060 	unsigned int   *addr;
7061 	int             count;
7062 
7063 	addr = (unsigned int *) phystokv(ptoa(pn));
7064 	count = PAGE_SIZE / sizeof(unsigned int);
7065 	while (count--) {
7066 		*addr++ = fill;
7067 	}
7068 }
7069 
extern void     mapping_set_mod(ppnum_t pn);

/*
 * Mark physical page 'pn' as modified (dirty) in the pmap-layer attribute
 * table; thin compatibility wrapper around pmap_set_modify().
 */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7078 
extern void     mapping_set_ref(ppnum_t pn);

/*
 * Mark physical page 'pn' as referenced in the pmap-layer attribute table;
 * thin compatibility wrapper around pmap_set_reference().
 */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7087 
7088 /*
7089  * Clear specified attribute bits.
7090  *
7091  * Try to force an arm_fast_fault() for all mappings of
7092  * the page - to force attributes to be set again at fault time.
7093  * If the forcing succeeds, clear the cached bits at the head.
7094  * Otherwise, something must have been wired, so leave the cached
7095  * attributes alone.
7096  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may only be manipulated from within the PPL. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* The caller will flush (arg) or batch-flush (flush_range), so don't suppress the TLBI here. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	/* Stale TLB entries could let writes land without re-setting 'modified'. */
	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Clearing 'referenced' means the next read/execute access must fault. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	/* Clearing 'modified' means the next write access must fault. */
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if all mappings were successfully demoted. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7170 
/*
 * Single-page entry point for clearing physical-page attribute bits:
 * forwards to phys_attribute_clear_with_flush_range with no deferred
 * flush range.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7180 
7181 #if __ARM_RANGE_TLBI__
/*
 * Clear the given attribute bits for every managed page mapped by a
 * single twig (one leaf page table) of the given pmap's VA range.
 *
 * The caller must hold the pmap lock (shared), and [start, end) must
 * not span more than one twig entry.
 *
 * @return 'end' if the whole range was processed, or the VA at which
 *         the walk stopped early because preemption became pending.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No translation table entry: nothing mapped here, report the range done. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Bail out early (after at least one page) if preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7225 
/*
 * Clear the given attribute bits for all managed pages mapped in
 * [start, end) of the given pmap, one twig at a time, coalescing TLB
 * invalidation into a single ranged flush at the end when needed.
 *
 * @return The VA at which processing stopped: 'end' on completion, or
 *         an earlier VA if the pmap lock could not be taken without
 *         blocking or preemption became pending; the caller is expected
 *         to retry from the returned VA.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates whether any PTE change requires a flush of this VA range. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* Returning 'start' signals the caller to retry the whole range. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to 'end'. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Perform the deferred ranged TLB invalidation outside the pmap lock. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7278 
/*
 * Clear attribute bits over a VA range, retrying the (possibly
 * preemptible) internal/PPL operation until the whole range has been
 * processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The callee returns the VA it stopped at; loop until the range is done. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7309 #endif /* __ARM_RANGE_TLBI__ */
7310 
/*
 * Clear attribute bits for a single physical page, dispatching to the
 * PPL when the page-protection layer is enabled.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7332 
7333 /*
7334  *	Set specified attribute bits.
7335  *
7336  *	Set cached value in the pv head because we have
7337  *	no per-mapping hardware support for referenced and
7338  *	modify bits.
7339  */
7340 MARK_AS_PMAP_TEXT void
7341 phys_attribute_set_internal(
7342 	ppnum_t pn,
7343 	unsigned int bits)
7344 {
7345 	pmap_paddr_t    pa = ptoa(pn);
7346 	assert(pn != vm_page_fictitious_addr);
7347 
7348 #if XNU_MONITOR
7349 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7350 		panic("%s: illegal request, "
7351 		    "pn=%u, bits=%#x",
7352 		    __FUNCTION__,
7353 		    pn, bits);
7354 	}
7355 #endif
7356 
7357 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7358 
7359 	return;
7360 }
7361 
/*
 * Set attribute bits for a single physical page, dispatching to the PPL
 * when the page-protection layer is enabled.
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7373 
7374 
7375 /*
7376  *	Check specified attribute bits.
7377  *
7378  *	use the software cached bits (since no hw support).
7379  */
7380 static boolean_t
7381 phys_attribute_test(
7382 	ppnum_t pn,
7383 	unsigned int bits)
7384 {
7385 	pmap_paddr_t    pa = ptoa(pn);
7386 	assert(pn != vm_page_fictitious_addr);
7387 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7388 }
7389 
7390 
7391 /*
7392  *	Set the modify/reference bits on the specified physical page.
7393  */
void
pmap_set_modify(ppnum_t pn)
{
	/* Record a 'modified' event in the software attribute table. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7399 
7400 
7401 /*
7402  *	Clear the modify bits on the specified physical page.
7403  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clear 'modified' with default options and no flush callback. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7410 
7411 
7412 /*
7413  *	pmap_is_modified:
7414  *
7415  *	Return whether or not the specified physical page is modified
7416  *	by any physical maps.
7417  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Software-cached bit; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7424 
7425 
7426 /*
7427  *	Set the reference bit on the specified physical page.
7428  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Record a 'referenced' event in the software attribute table. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7435 
7436 /*
7437  *	Clear the reference bits on the specified physical page.
7438  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clear 'referenced' with default options and no flush callback. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7445 
7446 
7447 /*
7448  *	pmap_is_referenced:
7449  *
7450  *	Return whether or not the specified physical page is referenced
7451  *	by any physical maps.
7452  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Software-cached bit; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7459 
7460 /*
7461  * pmap_get_refmod(phys)
7462  *  returns the referenced and modified bits of the specified
7463  *  physical page.
7464  */
7465 unsigned int
7466 pmap_get_refmod(
7467 	ppnum_t pn)
7468 {
7469 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7470 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7471 }
7472 
7473 static inline unsigned int
7474 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7475 {
7476 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7477 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7478 }
7479 
7480 /*
7481  * pmap_clear_refmod(phys, mask)
7482  *  clears the referenced and modified bits as specified by the mask
7483  *  of the specified physical page.
7484  */
7485 void
7486 pmap_clear_refmod_options(
7487 	ppnum_t         pn,
7488 	unsigned int    mask,
7489 	unsigned int    options,
7490 	void            *arg)
7491 {
7492 	unsigned int    bits;
7493 
7494 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7495 	phys_attribute_clear(pn, bits, options, arg);
7496 }
7497 
7498 /*
7499  * Perform pmap_clear_refmod_options on a virtual address range.
7500  * The operation will be performed in bulk & tlb flushes will be coalesced
7501  * if possible.
7502  *
7503  * Returns true if the operation is supported on this platform.
7504  * If this function returns false, the operation is not supported and
7505  * nothing has been modified in the pmap.
7506  */
/*
 * @return true if the bulk ranged operation was performed, false if the
 *         platform lacks ranged-TLBI support and the caller must fall
 *         back to per-page pmap_clear_refmod_options calls.
 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7534 
/*
 * Convenience wrapper: clear ref/mod state with no options or callback.
 */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7542 
7543 unsigned int
7544 pmap_disconnect_options(
7545 	ppnum_t pn,
7546 	unsigned int options,
7547 	void *arg)
7548 {
7549 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7550 		/*
7551 		 * On ARM, the "modified" bit is managed by software, so
7552 		 * we know up-front if the physical page is "modified",
7553 		 * without having to scan all the PTEs pointing to it.
7554 		 * The caller should have made the VM page "busy" so noone
7555 		 * should be able to establish any new mapping and "modify"
7556 		 * the page behind us.
7557 		 */
7558 		if (pmap_is_modified(pn)) {
7559 			/*
7560 			 * The page has been modified and will be sent to
7561 			 * the VM compressor.
7562 			 */
7563 			options |= PMAP_OPTIONS_COMPRESSOR;
7564 		} else {
7565 			/*
7566 			 * The page hasn't been modified and will be freed
7567 			 * instead of compressed.
7568 			 */
7569 		}
7570 	}
7571 
7572 	/* disconnect the page */
7573 	pmap_page_protect_options(pn, 0, options, arg);
7574 
7575 	/* return ref/chg status */
7576 	return pmap_get_refmod(pn);
7577 }
7578 
7579 /*
7580  *	Routine:
7581  *		pmap_disconnect
7582  *
7583  *	Function:
7584  *		Disconnect all mappings for this page and return reference and change status
7585  *		in generic format.
7586  *
7587  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7595 
7596 boolean_t
7597 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7598 {
7599 	if (ptoa(first) >= vm_last_phys) {
7600 		return FALSE;
7601 	}
7602 	if (ptoa(last) < vm_first_phys) {
7603 		return FALSE;
7604 	}
7605 
7606 	return TRUE;
7607 }
7608 
7609 /*
7610  * The state maintained by the noencrypt functions is used as a
7611  * debugging aid on ARM.  This incurs some overhead on the part
7612  * of the caller.  A special case check in phys_attribute_clear
7613  * (the most expensive path) currently minimizes this overhead,
7614  * but stubbing these functions out on RELEASE kernels yields
7615  * further wins.
7616  */
7617 boolean_t
7618 pmap_is_noencrypt(
7619 	ppnum_t pn)
7620 {
7621 #if DEVELOPMENT || DEBUG
7622 	boolean_t result = FALSE;
7623 
7624 	if (!pa_valid(ptoa(pn))) {
7625 		return FALSE;
7626 	}
7627 
7628 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7629 
7630 	return result;
7631 #else
7632 #pragma unused(pn)
7633 	return FALSE;
7634 #endif
7635 }
7636 
/*
 * Set the 'noencrypt' debugging attribute for the page; a no-op on
 * RELEASE kernels and for unmanaged pages.
 */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7651 
/*
 * Clear the 'noencrypt' debugging attribute for the page; a no-op on
 * RELEASE kernels and for unmanaged pages.
 */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7666 
7667 #if XNU_MONITOR
/* Return whether the (managed) page is owned by the PPL monitor. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7674 #endif
7675 
/*
 * Lock the per-page PV head lock for a managed page; unmanaged pages
 * (and all pages when the PPL owns the PV locks, i.e. XNU_MONITOR)
 * fall back to the global phys_backup_lock.
 *
 * NOTE: the 'else' of the managed-page check deliberately spans the
 * preprocessor boundary and binds to the brace-enclosed statement after
 * the #endif — edit with care.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7692 
7693 
/*
 * Counterpart of pmap_lock_phys_page: drop the PV head lock for a
 * managed page, or the global phys_backup_lock otherwise (always the
 * backup lock under XNU_MONITOR).
 *
 * NOTE: as in pmap_lock_phys_page, the 'else' spans the preprocessor
 * boundary and binds to the statement after the #endif.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7710 
/*
 * Switch the user translation table base register (TTBR) to the given
 * pmap, updating the per-CPU nested-pmap bookkeeping and (on mixed
 * page-size configurations) the TCR.  Passing the kernel pmap clears
 * the user TTB instead.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache the nested (shared-region) pmap state for fault handling on this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram the TCR only when the target pmap's configuration differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Program TTBR with the pmap's table base and its hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7739 
/* Point the user TTBR at the canonical invalid translation table. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7745 
/*
 * Clear the user TTBR, dispatching to the PPL when the page-protection
 * layer is enabled.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7757 
7758 
7759 #if defined(__arm64__)
7760 /*
7761  * Marker for use in multi-pass fast-fault PV list processing.
7762  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7763  * these functions, as compressed PTEs should never be present in PV lists.
7764  * Note that this only holds true for arm64; for arm32 we don't have enough
7765  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7766  * and WRITEABLE marker depending on whether the PTE is valid.
7767  */
7768 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7769 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7770 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7771 #endif
7772 
7773 
/*
 * Downgrade the permissions of every mapping of a physical page so that
 * accesses not permitted by 'allow_mode' take a fault, allowing ref/mod
 * state to be re-gathered.  Uses a two-pass walk of the page's PV list:
 * pass 1 rewrites PTEs (marking those needing invalidation with
 * ARM_PTE_FF_MARKER), pass 2 issues the TLB invalidations.
 *
 * @param ppnum        Physical page whose mappings are downgraded.
 * @param allow_mode   Access modes to leave enabled in the mappings.
 * @param options      PMAP_OPTIONS_* flags (reusable accounting, locking,
 *                     flush control).
 * @param flush_range  If non-NULL, TLB invalidation for mappings inside
 *                     the range is deferred to the caller.
 *
 * @return TRUE if all mappings were downgraded; FALSE for an unmanaged
 *         page or when a wired mapping was skipped.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	/* With PMAP_OPTIONS_FF_LOCKED the caller already holds the PV head lock. */
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	/* Remember the list head so pass 2 can re-walk from the beginning. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are skipped (and reported via FALSE) unless FF_WIRED is set. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Only mappings outside the caller's flush range need a local TLBI in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 require an invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* The two passes must have seen exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that mod/ref fast faults were forced for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8087 
/*
 * Validated single-page entry point for arm_force_fast_fault: rejects
 * options reserved for internal/ranged callers, then forwards with no
 * deferred flush range.
 */
MARK_AS_PMAP_TEXT boolean_t
arm_force_fast_fault_internal(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options)
{
	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
	}
	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
}
8099 
8100 /*
8101  *	Routine:	arm_force_fast_fault
8102  *
8103  *	Function:
8104  *		Force all mappings for this page to fault according
8105  *		to the access modes allowed, so we can gather ref/modify
8106  *		bits again.
8107  */
8108 
boolean_t
arm_force_fast_fault(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	__unused void   *arg)
{
	pmap_paddr_t    phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	/* Dispatch to the PPL when the page-protection layer is enabled. */
#if XNU_MONITOR
	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
#else
	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
#endif
}
8130 
8131 /*
8132  *	Routine:	arm_clear_fast_fault
8133  *
8134  *	Function:
8135  *		Clear pending force fault for all mappings for this page based on
8136  *		the observed fault type, update ref/modify bits.
8137  */
/*
 * @param ppnum      Physical page whose mappings are updated.
 * @param fault_type Observed access type (read and/or write).
 * @param pte_p      If non-NULL, restrict the operation to this single PTE
 *                   instead of walking the page's entire PV list.
 *
 * @return TRUE if at least one PTE was updated (the faulting access should be
 *         redriven), FALSE otherwise.
 *
 * The PVH lock for the page must be held by the caller.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;           /* PVEs visited; bounds each pass at PMAP_MAX_PV_LIST_CHUNK_SIZE */
	unsigned int    pass1_updated = 0;  /* PTEs marked for TLB invalidation in pass 1 */
	unsigned int    pass2_updated = 0;  /* marked PTEs found and cleaned in pass 2; must equal pass1_updated */

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no specific PTE was supplied, start from this page's PV head. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting position so pass 2 can re-walk the same list. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU-visible PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		/*
		 * A write fault on a mapping that was tracked as writeable:
		 * restore write permission, and record ref+mod for the page.
		 */
		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* A read fault on a mapping without AF: set AF and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* Only HW-visible changes (beyond the SW writeable-tracking bit) need a TLBI. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the walk so we don't hold the PVH lock for an unbounded time. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Ensure all pass-1 PTE stores are visible before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need invalidation; clear the marker as we go. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have seen the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8339 
8340 /*
8341  * Determine if the fault was induced by software tracking of
8342  * modify/reference bits.  If so, re-enable the mapping (and set
8343  * the appropriate bits).
8344  *
8345  * Returns KERN_SUCCESS if the fault was induced and was
8346  * successfully handled.
8347  *
8348  * Returns KERN_FAILURE if the fault was not induced and
8349  * the function was unable to deal with it.
8350  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8352  * disallows this type of access.
8353  *
8354  * Returns KERN_ABORTED if the pmap lock is taken and a
8355  * preemption is pending.
8356  *
8357  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Back out (caller redrives) rather than blocking a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/* Loop until we observe a stable PTE value while holding the PVH lock. */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Writes to PPL-owned I/O memory are explicitly disallowed. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* The PTE changed under us; drop the lock and re-read. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8497 
8498 kern_return_t
8499 arm_fast_fault(
8500 	pmap_t pmap,
8501 	vm_map_address_t va,
8502 	vm_prot_t fault_type,
8503 	bool was_af_fault,
8504 	__unused bool from_user)
8505 {
8506 	kern_return_t   result = KERN_FAILURE;
8507 
8508 	if (va < pmap->min || va >= pmap->max) {
8509 		return result;
8510 	}
8511 
8512 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8513 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8514 	    from_user);
8515 
8516 	do {
8517 #if XNU_MONITOR
8518 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8519 #else
8520 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8521 #endif
8522 	} while (result == KERN_ABORTED);
8523 
8524 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8525 
8526 	return result;
8527 }
8528 
8529 void
8530 pmap_copy_page(
8531 	ppnum_t psrc,
8532 	ppnum_t pdst)
8533 {
8534 	bcopy_phys((addr64_t) (ptoa(psrc)),
8535 	    (addr64_t) (ptoa(pdst)),
8536 	    PAGE_SIZE);
8537 }
8538 
8539 
/*
 *	pmap_copy_part_page copies the specified portion (len bytes) of a
 *	(machine independent) page from one physical page to another.
 */
8543 void
8544 pmap_copy_part_page(
8545 	ppnum_t psrc,
8546 	vm_offset_t src_offset,
8547 	ppnum_t pdst,
8548 	vm_offset_t dst_offset,
8549 	vm_size_t len)
8550 {
8551 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8552 	    (addr64_t) (ptoa(pdst) + dst_offset),
8553 	    len);
8554 }
8555 
8556 
8557 /*
8558  *	pmap_zero_page zeros the specified (machine independent) page.
8559  */
8560 void
8561 pmap_zero_page(
8562 	ppnum_t pn)
8563 {
8564 	assert(pn != vm_page_fictitious_addr);
8565 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8566 }
8567 
8568 /*
8569  *	pmap_zero_part_page
8570  *	zeros the specified (machine independent) part of a page.
8571  */
8572 void
8573 pmap_zero_part_page(
8574 	ppnum_t pn,
8575 	vm_offset_t offset,
8576 	vm_size_t len)
8577 {
8578 	assert(pn != vm_page_fictitious_addr);
8579 	assert(offset + len <= PAGE_SIZE);
8580 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8581 }
8582 
/*
 * Establish the kernel mapping for the low-globals page at its fixed alias
 * address (LOWGLOBAL_ALIAS).  Called once; the alias PTE must be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Map the lowGlo page at the alias: valid, accessed, non-executable. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	FLUSH_PTE();
	/* Make the new translation visible before the alias is dereferenced. */
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8607 
8608 vm_offset_t
8609 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8610 {
8611 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8612 		panic("%s: invalid index %u", __func__, index);
8613 	}
8614 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8615 }
8616 
/*
 * Map the given physical page into a free per-CPU copy window.
 *
 * @param pn        Physical page to map.
 * @param prot      VM_PROT_WRITE selects a writable mapping; otherwise read-only.
 * @param wimg_bits Cacheability/memory-attribute bits for the mapping.
 *
 * @return the window index used; pass it to pmap_unmap_cpu_windows_copy().
 * Panics if no window on this CPU is free.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* The PPL only allows copy windows onto unmanaged (I/O) pages. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first unused copy window on this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build the window PTE: valid, accessed, never-executable. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the strong-sync state recorded by the window's previous occupant. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8691 
8692 unsigned int
8693 pmap_map_cpu_windows_copy(
8694 	ppnum_t pn,
8695 	vm_prot_t prot,
8696 	unsigned int wimg_bits)
8697 {
8698 #if XNU_MONITOR
8699 	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8700 #else
8701 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8702 #endif
8703 }
8704 
/*
 * Tear down the per-CPU copy window previously returned by
 * pmap_map_cpu_windows_copy_internal(), clearing the PTE and invalidating
 * the TLB for the window's VA.
 *
 * @param index Window index on the current CPU to unmap.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8725 
/*
 * Unmap a per-CPU copy window, dispatching to the PPL when the monitor
 * is enabled.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8736 
8737 #if XNU_MONITOR
8738 
8739 MARK_AS_PMAP_TEXT void
8740 pmap_invoke_with_page(
8741 	ppnum_t page_number,
8742 	void *ctx,
8743 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8744 {
8745 	#pragma unused(page_number, ctx, callback)
8746 }
8747 
8748 /*
8749  * Loop over every pmap_io_range (I/O ranges marked as owned by
8750  * the PPL in the device tree) and conditionally call callback() on each range
8751  * that needs to be included in the hibernation image.
8752  *
8753  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8754  *                 context is needed in the callback.
8755  * @param callback Callback function invoked on each range (gated by flag).
8756  */
8757 MARK_AS_PMAP_TEXT void
8758 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8759 {
8760 	extern const pmap_io_range_t* io_attr_table;
8761 	extern const unsigned int num_io_rgns;
8762 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8763 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8764 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8765 		}
8766 	}
8767 }
8768 
8769 /**
8770  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8771  * PPL-owned page. Otherwise, do nothing.
8772  *
8773  * @param addr Physical address of the page to set the HASHED flag on.
8774  */
8775 MARK_AS_PMAP_TEXT void
8776 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8777 {
8778 	/* Ignore non-managed kernel memory. */
8779 	if (!pa_valid(addr)) {
8780 		return;
8781 	}
8782 
8783 	const unsigned int pai = pa_index(addr);
8784 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8785 		pv_entry_t **pv_h = pai_to_pvh(pai);
8786 
8787 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8788 		pvh_lock(pai);
8789 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8790 		pvh_unlock(pai);
8791 	}
8792 }
8793 
8794 /**
8795  * Loop through every physical page in the system and clear out the HASHED flag
8796  * on every PPL-owned page. That flag is used to keep track of which pages have
8797  * been hashed into the hibernation image during the hibernation entry process.
8798  *
8799  * The HASHED flag needs to be cleared out between hibernation cycles because the
8800  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8801  * image with the HASHED flag set on certain pages. It's important to clear the
8802  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8803  * into the hibernation image can't be compromised across hibernation cycles.
8804  */
8805 MARK_AS_PMAP_TEXT void
8806 pmap_clear_ppl_hashed_flag_all(void)
8807 {
8808 	const unsigned int last_index = pa_index(vm_last_phys);
8809 	pv_entry_t **pv_h = NULL;
8810 
8811 	for (int pai = 0; pai < last_index; ++pai) {
8812 		pv_h = pai_to_pvh(pai);
8813 
8814 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8815 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8816 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8817 			pvh_lock(pai);
8818 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8819 			pvh_unlock(pai);
8820 		}
8821 	}
8822 }
8823 
8824 /**
8825  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8826  * ppl_hib driver will call this after all wired pages have been copied into the
8827  * hibernation image.
8828  */
8829 MARK_AS_PMAP_TEXT void
8830 pmap_check_ppl_hashed_flag_all(void)
8831 {
8832 	const unsigned int last_index = pa_index(vm_last_phys);
8833 	pv_entry_t **pv_h = NULL;
8834 
8835 	for (int pai = 0; pai < last_index; ++pai) {
8836 		pv_h = pai_to_pvh(pai);
8837 
8838 		/**
8839 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8840 		 * the pages that contain the PMAP stacks.
8841 		 */
8842 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8843 		    (pai < pa_index(pmap_stacks_end_pa));
8844 
8845 		if (!is_pmap_stack &&
8846 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8847 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8848 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8849 		}
8850 	}
8851 }
8852 
8853 #endif /* XNU_MONITOR */
8854 
8855 /*
8856  * Indicate that a pmap is intended to be used as a nested pmap
8857  * within one or more larger address spaces.  This must be set
8858  * before pmap_nest() is called with this pmap as the 'subordinate'.
8859  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Atomically convert USER -> NESTED; any other starting type is fatal. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Nested pmaps don't keep their own ASID/ID; release it. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8880 
8881 void
8882 pmap_set_nested(
8883 	pmap_t pmap)
8884 {
8885 #if XNU_MONITOR
8886 	pmap_set_nested_ppl(pmap);
8887 #else
8888 	pmap_set_nested_internal(pmap);
8889 #endif
8890 }
8891 
8892 bool
8893 pmap_is_nested(
8894 	pmap_t pmap)
8895 {
8896 	return pmap->type == PMAP_TYPE_NESTED;
8897 }
8898 
8899 /*
8900  * pmap_trim_range(pmap, start, end)
8901  *
8902  * pmap  = pmap to operate on
8903  * start = start of the range
8904  * end   = end of the range
8905  *
8906  * Attempts to deallocate TTEs for the given range in the nested range.
8907  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The trimmed range must lie entirely within the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing removed; the callees above would have dropped the lock for us. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan every L2 entry under this L1 entry for live TTEs. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9023 
9024 /**
9025  * State machine for multi-step pmap trimming. Trimming is the action of
9026  * deallocating the TTEs of the shared region of pmaps down to a given range.
9027  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9028  * disabling preemption for too long. These steps include computing the bounds
9029  * of the shared region, trimming the head of the "grand", trimming the tail of
9030  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9031  * different conditions.
9032  *
9033  * @param grand the pmap in which the pages are nested
9034  * @param subord the pmap from which the pages are shared, or nested
9035  * @param vstart start of the used range in "grand"
9036  * @param size size of the used range
9037  * @param state the current state of the state machine
9038  *
9039  * @return the next state of the state machine, to be used in the next call
9040  *         into this function.
9041  */
9042 MARK_AS_PMAP_TEXT pmap_trim_state_t
9043 pmap_trim_internal(
9044 	pmap_t grand,
9045 	pmap_t subord,
9046 	addr64_t vstart,
9047 	uint64_t size,
9048 	pmap_trim_state_t state)
9049 {
9050 	/* Validation needs to be done regardless of state. */
9051 	addr64_t vend;
9052 
9053 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9054 		panic("%s: grand addr wraps around, "
9055 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9056 		    __func__, grand, subord, (void*)vstart, size, state);
9057 	}
9058 
9059 	validate_pmap_mutable(grand);
9060 	validate_pmap(subord);
9061 
9062 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9063 		panic("%s: subord is of non-nestable type 0x%hhx, "
9064 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9065 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9066 	}
9067 
9068 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9069 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9070 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9071 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9072 	}
9073 
9074 	if (__improbable(grand->nested_pmap != subord)) {
9075 		panic("%s: grand->nested != subord, "
9076 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9077 		    __func__, grand, subord, (void*)vstart, size, state);
9078 	}
9079 
9080 	if (__improbable((size != 0) &&
9081 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9082 		panic("%s: grand range not in nested region, "
9083 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9084 		    __func__, grand, subord, (void*)vstart, size, state);
9085 	}
9086 
9087 	/* Trimming starts with figuring out the bounds for the grand. */
9088 	if (state == PMAP_TRIM_STATE_START) {
9089 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9090 
9091 		/**
9092 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9093 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9094 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9095 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9096 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9097 		 * PMAP_TRIM_STATE_DONE.
9098 		 */
9099 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9100 			assert(subord->nested_bounds_set);
9101 
9102 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9103 			if (!grand->nested_bounds_set) {
9104 				/* Inherit the bounds from subord. */
9105 				grand->nested_region_true_start = subord->nested_region_true_start;
9106 				grand->nested_region_true_end = subord->nested_region_true_end;
9107 				grand->nested_bounds_set = true;
9108 			}
9109 
9110 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9111 
9112 			/* Now that the grand has bounds, we are done. */
9113 			return PMAP_TRIM_STATE_DONE;
9114 		}
9115 
9116 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9117 		if ((!subord->nested_bounds_set) && size) {
9118 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9119 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9120 
9121 			subord->nested_region_true_start = vstart;
9122 			subord->nested_region_true_end = vend;
9123 			subord->nested_region_true_start &= ~adjust_offmask;
9124 
9125 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9126 				panic("%s: padded true end wraps around, "
9127 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9128 				    __func__, grand, subord, (void*)vstart, size, state);
9129 			}
9130 
9131 			subord->nested_region_true_end &= ~adjust_offmask;
9132 			subord->nested_bounds_set = true;
9133 		}
9134 
9135 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9136 		if (subord->nested_bounds_set) {
9137 			/* Inherit the bounds from subord. */
9138 			grand->nested_region_true_start = subord->nested_region_true_start;
9139 			grand->nested_region_true_end = subord->nested_region_true_end;
9140 			grand->nested_bounds_set = true;
9141 
9142 			/* If we know the bounds, we can trim the pmap. */
9143 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9144 
9145 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9146 		} else {
9147 			/* Don't trim if we don't know the bounds. */
9148 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9149 
9150 			return PMAP_TRIM_STATE_DONE;
9151 		}
9152 	}
9153 
9154 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9155 	if (!grand->nested_bounds_set) {
9156 		panic("%s: !grand->nested_bounds_set, "
9157 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9158 		    __func__, grand, subord, (void*)vstart, size, state);
9159 	}
9160 
9161 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9162 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9163 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9164 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9165 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9166 			    (unsigned int)grand->nested_no_bounds_ref_state);
9167 		}
9168 
9169 #if XNU_MONITOR
9170 		if (pmap_pending_preemption()) {
9171 			return PMAP_TRIM_STATE_GRAND_AFTER;
9172 		}
9173 #endif
9174 
9175 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9176 	}
9177 
9178 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9179 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9180 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9181 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9182 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9183 			    (unsigned int)grand->nested_no_bounds_ref_state);
9184 		}
9185 
9186 #if XNU_MONITOR
9187 		if (pmap_pending_preemption()) {
9188 			return PMAP_TRIM_STATE_SUBORD;
9189 		}
9190 #endif
9191 
9192 		state = PMAP_TRIM_STATE_SUBORD;
9193 	}
9194 
9195 	/* START state is guaranteed to compute the bounds for the subord. */
9196 	if (!subord->nested_bounds_set) {
9197 		panic("%s: !subord->nested_bounds_set, "
9198 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9199 		    __func__, grand, subord, (void*)vstart, size, state);
9200 	}
9201 
9202 	if (state == PMAP_TRIM_STATE_SUBORD) {
9203 		/**
9204 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9205 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9206 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9207 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9208 		 * the state update is visible only once the preceding trim operation is complete.  An
9209 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9210 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9211 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9212 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9213 		 * of the state CAS.
9214 		 */
9215 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9216 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9217 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9218 			    (unsigned int)grand->nested_no_bounds_ref_state);
9219 		}
9220 		pmap_trim_subord(subord);
9221 	}
9222 
9223 	return PMAP_TRIM_STATE_DONE;
9224 }
9225 
9226 MARK_AS_PMAP_TEXT static void
9227 pmap_trim_self(pmap_t pmap)
9228 {
9229 	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9230 		/* If we have a no bounds ref, we need to drop it. */
9231 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9232 		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9233 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9234 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9235 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9236 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9237 
9238 		if (nested_bounds_set) {
9239 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9240 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9241 		}
9242 		/*
9243 		 * Try trimming the nested pmap, in case we had the
9244 		 * last reference.
9245 		 */
9246 		pmap_trim_subord(pmap->nested_pmap);
9247 	}
9248 }
9249 
9250 /*
9251  * pmap_trim_subord(grand, subord)
9252  *
9253  * grand  = pmap that we have nested subord in
9254  * subord = nested pmap we are attempting to trim
9255  *
9256  * Trims subord if possible
9257  */
9258 MARK_AS_PMAP_TEXT static void
9259 pmap_trim_subord(pmap_t subord)
9260 {
9261 	bool contract_subord = false;
9262 
9263 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9264 
9265 	subord->nested_no_bounds_refcnt--;
9266 
9267 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9268 		/* If this was the last no bounds reference, trim subord. */
9269 		contract_subord = true;
9270 	}
9271 
9272 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9273 
9274 	if (contract_subord) {
9275 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9276 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9277 	}
9278 }
9279 
9280 /**
9281  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9282  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9283  * disabling preemption for too long.
9284  *
 * @note When we load the shared region we always create page tables for the
9286  *       entire region. In practice, the shared cache may use just a portion
9287  *       of that. Before we know the bounds of the shared region, it can
9288  *       already be mapped into processes. Therefore, once the bounds are
9289  *       known, "trimming" comes in handy to remove the unnecessary page
9290  *       tables in the processes the shared region is mapped in, and eventually
9291  *       those in the shared region itself. Note that the shared region must
9292  *       be trimmed after the user processes because it has the L3 entries
9293  *       everyone else is pointing to.
9294  *
9295  * @param grand the pmap in which the pages are nested
9296  * @param subord the pmap from which the pages are shared, or nested
9297  * @param vstart start of the used range in "grand"
9298  * @param size size of the used range
9299  */
9300 void
9301 pmap_trim(
9302 	pmap_t grand,
9303 	pmap_t subord,
9304 	addr64_t vstart,
9305 	uint64_t size)
9306 {
9307 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9308 
9309 #if XNU_MONITOR
9310 	/* On PPL systems, drives the state machine until its done. */
9311 	while (state != PMAP_TRIM_STATE_DONE) {
9312 		__assert_only pmap_trim_state_t old_state = state;
9313 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9314 
9315 		/* Are we making progress? */
9316 		assert(old_state != state);
9317 	}
9318 
9319 	pmap_ledger_check_balance(grand);
9320 	pmap_ledger_check_balance(subord);
9321 #else
9322 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9323 
9324 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9325 	assert(state == PMAP_TRIM_STATE_DONE);
9326 #endif
9327 }
9328 
9329 #if HAS_APPLE_PAC
/**
 * Sign a user-mode pointer with a process-independent PAC key, using the
 * caller-supplied user JOP key for the duration of the signing operation.
 *
 * @param value the raw pointer value to sign
 * @param key the ptrauth key to sign with; must be ptrauth_key_asia or
 *        ptrauth_key_asda, otherwise this function panics
 * @param discriminator the ptrauth discriminator mixed into the signature
 * @param jop_key the user JOP key to install while signing
 *
 * @return the signed pointer
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent A-keys are allowed here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/* Keep interrupts off across the user JOP key switch below. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/*
	 * Force 'value' to be materialized before the sign and keep the compiler
	 * from moving the signing outside the window where the user JOP key is live.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	/* Restore the previous JOP key state before re-enabling interrupts. */
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9361 
/* External entry point: forwards directly to pmap_sign_user_ptr_internal(). */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9367 
/**
 * Authenticate a user-mode pointer signed with a process-independent PAC key,
 * using the caller-supplied user JOP key for the duration of the operation.
 *
 * @param value the signed pointer value to authenticate
 * @param key the ptrauth key to authenticate against; must be ptrauth_key_asia
 *        or ptrauth_key_asda, otherwise this function panics
 * @param discriminator the ptrauth discriminator used when signing
 * @param jop_key the user JOP key to install while authenticating
 *
 * @return the result of ml_auth_ptr_unchecked() on 'value'
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent A-keys are allowed here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/* Keep interrupts off across the user JOP key switch below. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/*
	 * Barriers pin the auth operation inside the window where the user JOP
	 * key is live; the compiler must not hoist or sink it across the key switch.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9388 
/* External entry point: forwards directly to pmap_auth_user_ptr_internal(). */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9394 #endif /* HAS_APPLE_PAC */
9395 
9396 /*
9397  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9398  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9399  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9400  * return value, to indicate where a preempted [un]nest operation should resume.
9401  * When the return value contains the ending address of the nested region with
9402  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9403  */
9404 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9405 
9406 /*
9407  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9408  *
9409  *	grand  = the pmap that we will nest subord into
9410  *	subord = the pmap that goes into the grand
9411  *	vstart  = start of range in pmap to be inserted
9412  *	size   = Size of nest area (up to 16TB)
9413  *
9414  *	Inserts a pmap into another.  This is used to implement shared segments.
9415  *
9416  */
9417 
9418 /**
9419  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9420  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9421  * This function operates in 3 main phases:
9422  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9423  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9424  *    the mapping range are present in subord.
9425  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9426  *    contains pointers to subord's leaf-level pagetable pages for the specified
9427  *    VA range.
9428  *
9429  * This function may return early due to pending AST_URGENT preemption; if so
9430  * it will indicate the need to be re-entered.
9431  *
9432  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9433  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9434  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9435  * @param size twig-aligned size of the nesting range
9436  * @param vrestart the twig-aligned starting address of the current call.  May contain
9437  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9438  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9439  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9440  *
9441  * @return the virtual address at which to restart the operation, possibly including
9442  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9443  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9444  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	uint64_t        nested_region_unnested_table_bitmap_size;
	unsigned int*   nested_region_unnested_table_bitmap = NULL;
	uint64_t        new_nested_region_unnested_table_bitmap_size;
	unsigned int*   new_nested_region_unnested_table_bitmap = NULL;
	int             expand_options = 0;
	bool            deref_subord = true;
	bool            grand_locked = false;

	/* Validate the requested range: no VA wraparound, and vrestart within [vstart, vend]. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord for the duration of (and possibly beyond) this call. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	/* Both pmaps must share the same page table geometry. */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* The nesting range (and any restart address) must be twig-table aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1a: bookkeeping.  Lazily allocate subord's unnested-table bitmap
	 * (one bit per twig-level table) and publish the nested region fields.
	 * The allocation is done before taking the lock; if another thread won the
	 * race, our copy is freed after unlocking.
	 */
	if (subord->nested_region_unnested_table_bitmap == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may have installed the bitmap first. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the race: free the bitmap we allocated. */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * Phase 1b: if the requested range extends past subord's current nested
	 * region, grow the region and replace the bitmap with a larger one.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t        new_size;

		nested_region_unnested_table_bitmap = NULL;
		nested_region_unnested_table_bitmap_size = 0ULL;
		new_size =  vend - subord->nested_region_addr;

		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_unnested_table_bitmap = kalloc_data(
			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may already have grown the region. */
		if (subord->nested_region_size < new_size) {
			/*
			 * NOTE(review): the bcopy length here is
			 * nested_region_unnested_table_bitmap_size, which elsewhere in this
			 * function is treated as a count of unsigned ints (allocations and
			 * frees all scale it by sizeof(unsigned int)).  bcopy takes bytes,
			 * so this appears to copy only 1/sizeof(unsigned int) of the old
			 * bitmap — presumably the length should be scaled by
			 * sizeof(unsigned int) as well.  Confirm whether high bitmap words
			 * can be silently dropped when the nested region grows.
			 */
			bcopy(subord->nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size);
			nested_region_unnested_table_bitmap_size  = subord->nested_region_unnested_table_bitmap_size;
			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free whichever bitmap (old or unused new) is left over. */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
		if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			new_nested_region_unnested_table_bitmap = NULL;
		}
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	/* First nesting into this grand wins the CAS and records subord as its nested pmap. */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
		/**
		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
		 * into a nested pmap, which would then produce multiple levels of nesting.
		 */
		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
		}
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		if (__improbable(vstart < subord->nested_region_addr ||
		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* grand was already nested: validate it is the same subord and grow its view if needed. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working range to subord's true bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* A restart with PMAP_NEST_GRAND set means phase 2 already completed. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/*
	 * Phase 2: expand subord so a leaf-level page table exists for every twig
	 * entry in the range.  The lock is dropped around pmap_expand(), and the
	 * loop periodically yields to pending preemption (vrestart records progress).
	 */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Phase 2 complete: mark the restart cursor as being in the grand phase. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	/*
	 * Phase 3: copy subord's twig-level TTEs into grand, expanding grand to
	 * the twig level where needed, again yielding periodically to preemption.
	 */
	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (__improbable(stte_p == PT_ENTRY_NULL)) {
			panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
		}
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Preserve any expansion failure code over the lock abort. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		*gtte_p = *stte_p;

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Report completion as vend | PMAP_NEST_GRAND once the true end is reached. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the copied TTEs visible before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
	/* Only report non-success codes; krp is assumed to hold KERN_SUCCESS on entry. */
#if XNU_MONITOR
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	/* Free any bitmap allocations that were never published into subord. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(new_nested_region_unnested_table_bitmap,
		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	/* Drop the reference taken above unless grand now permanently holds it. */
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9856 
/**
 * Public entry point for nesting 'subord' into 'grand' (see the comment above
 * pmap_nest_internal() for the detailed algorithm).  Drives the operation to
 * completion, re-entering the implementation whenever it returns early.
 *
 * @param grand the pmap that we will nest subord into
 * @param subord the pmap that goes into the grand
 * @param vstart twig-aligned start of the range to nest
 * @param size twig-aligned size of the nesting range
 *
 * @return KERN_SUCCESS, or a failure code from the implementation
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	/* vaddr doubles as the restart cursor; completion is (vend | PMAP_NEST_GRAND). */
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/*
	 * PPL calls may bail out early on allocation failure, lock contention, or
	 * pending preemption; loop until the cursor reports completion.
	 */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* Donate a page to the PPL and retry from the same cursor. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call that didn't advance the cursor would loop forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9915 
9916 /*
9917  *	kern_return_t pmap_unnest(grand, vaddr)
9918  *
9919  *	grand  = the pmap that will have the virtual range unnested
9920  *	vaddr  = start of range in pmap to be unnested
9921  *	size   = size of range in pmap to be unnested
9922  *
9923  */
9924 
/**
 * Undo a prior pmap_nest() over [vaddr, vaddr + size) with no special options.
 * Thin wrapper around pmap_unnest_options() with an option word of 0.
 */
kern_return_t
pmap_unnest(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size)
{
	return pmap_unnest_options(grand, vaddr, size, 0);
}
9933 
9934 /**
9935  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9936  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9937  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9938  * still have the region nested.  The mappings in 'grand' will be left empty
9939  * with the assumption that they will be demand-filled by subsequent access faults.
9940  *
9941  * This function operates in 2 main phases:
9942  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9943  *    them non-global.
9944  * 2. Clearing of the twig-level TTEs for the address range in grand.
9945  *
9946  * This function may return early due to pending AST_URGENT preemption; if so
9947  * it will indicate the need to be re-entered.
9948  *
9949  * @param grand pmap from which to unnest mappings
9950  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9951  * @param size twig-aligned size of the nested range
9952  * @param vrestart the page-aligned starting address of the current call.  May contain
9953  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9954  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9955  *        grand is being torn down and step 1) above is not needed.
9956  *
9957  * @return the virtual address at which to restart the operation, possibly including
9958  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9959  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9960  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t     *tte_p;
	unsigned int    current_index;
	unsigned int    start_index;
	unsigned int    max_index;
	unsigned int    entry_count = 0;   /* pages processed; drives periodic preemption checks */

	addr64_t vend;
	addr64_t true_end;
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The restart cursor (sans phase bit) must lie within [vaddr, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the working end to the nested pmap's actually-populated region. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/*
	 * Phase 1: walk the nested pmap's mappings and mark them non-global.
	 * Skipped when PMAP_UNNEST_CLEAN is set (grand is being torn down) or
	 * when PMAP_NEST_GRAND in vrestart says phase 1 already completed.
	 */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
		}

		/* Bail out (caller will re-enter) rather than blocking on a contended lock. */
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t  *bpte, *cpte;

			/* End of the current twig-sized region (twig-aligned). */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
			 * address.
			 */
			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    (addr & pt_attr_twig_offmask(pt_attr))) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t    pa;
					unsigned int    pai = 0;
					boolean_t               managed = FALSE;
					pt_entry_t  spte;

					if ((*cpte != ARM_PTE_TYPE_FAULT)
					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						spte = *((volatile pt_entry_t*)cpte);
						/* Re-read the PTE under the PVH lock until it stably maps the same managed page. */
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		/* Phase 1 incomplete (preempted): hand the cursor back to the caller. */
		if (current_index < max_index) {
			return vrestart;
		}
	}

	/*
	 * Phase 2:
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		/*
		 * NOTE(review): the `!= 0x0ULL` comparison sits outside the
		 * __improbable() parentheses; since __improbable() preserves the
		 * truth value, behavior is unchanged, but the intended form is
		 * likely __improbable((addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL).
		 */
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straight into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached (clamped) true_end: report completion for the full requested range. */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10171 
10172 kern_return_t
10173 pmap_unnest_options(
10174 	pmap_t grand,
10175 	addr64_t vaddr,
10176 	uint64_t size,
10177 	unsigned int option)
10178 {
10179 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10180 	vm_map_offset_t vend = vaddr + size;
10181 
10182 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10183 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10184 
10185 	pmap_verify_preemptible();
10186 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10187 #if XNU_MONITOR
10188 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10189 #else
10190 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10191 #endif
10192 	}
10193 
10194 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10195 
10196 	return KERN_SUCCESS;
10197 }
10198 
/**
 * Hook allowing the pmap layer to adjust the bounds of a range being
 * unnested by the VM layer.  The ARM pmap performs no adjustment.
 *
 * @return TRUE unconditionally, so the caller continues on to
 *         log_unnest_badness().
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10207 
10208 #if PMAP_FORK_NEST
10209 /**
10210  * Perform any necessary pre-nesting of the parent's shared region at fork()
10211  * time.
10212  *
10213  * @note This should only be called from vm_map_fork().
10214  *
10215  * @param old_pmap The pmap of the parent task.
10216  * @param new_pmap The pmap of the child task.
10217  * @param nesting_start An output parameter that is updated with the start
10218  *                      address of the range that was pre-nested
10219  * @param nesting_end An output parameter that is updated with the end
10220  *                      address of the range that was pre-nested
10221  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10223  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10224  */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Parent has no shared region nested: nothing to pre-nest. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}
	/*
	 * Nest the parent's shared region into the child at the same address
	 * and size.
	 * NOTE(review): the return value of pmap_nest() is discarded; a failure
	 * would only be caught by the assertf below on assert-enabled kernels.
	 * Confirm that callers are fine with best-effort semantics here.
	 */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* Report the pre-nested range back to vm_map_fork(). */
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
10256 #endif /* PMAP_FORK_NEST */
10257 
10258 /*
10259  * disable no-execute capability on
10260  * the specified pmap
10261  */
#if DEVELOPMENT || DEBUG
/* Clear the pmap's nx_enabled flag (DEVELOPMENT/DEBUG kernels only). */
void
pmap_disable_NX(
	pmap_t pmap)
{
	pmap->nx_enabled = FALSE;
}
#else
/* On RELEASE kernels, NX can never be disabled: this is a no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10276 
10277 /*
10278  * flush a range of hardware TLB entries.
10279  * NOTE: assumes the smallest TLB entry in use will be for
10280  * an ARM small page (4K).
10281  */
10282 
#if __ARM_RANGE_TLBI__
/* Above this many pages, use a single range-TLBI instead of per-entry TLBIs. */
#define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
/* Above the range-TLBI capacity, fall back to a full ASID (or all-TLB) flush. */
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
#else
/* Without range-TLBI support, flush the whole ASID past this page count. */
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
#endif // __ARM_RANGE_TLBI__
static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
    "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
    "of npages to 32 bits below may truncate.");
10292 
/**
 * Queue (but do not synchronize) TLB invalidations covering [va, va + length)
 * for the given pmap, picking the cheapest suitable mechanism:
 * full TLB / whole-ASID flush, a range TLBI (when supported), or a
 * per-entry invalidation loop, based on the number of pages involved.
 * The caller is responsible for the eventual sync (e.g. sync_tlb_flush()).
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * With no usable ASID, or for a nested pmap (whose mappings are
		 * presumably live under multiple ASIDs — confirm), an ASID-targeted
		 * flush is insufficient: flush everything.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small region: invalidate entry by entry, tagging each VA with the ASID. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10345 
10346 MARK_AS_PMAP_TEXT static void
10347 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10348 {
10349 	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10350 }
10351 
10352 void
10353 flush_mmu_tlb_region(
10354 	vm_offset_t va,
10355 	unsigned length)
10356 {
10357 	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
10358 	sync_tlb_flush();
10359 }
10360 
10361 unsigned int
10362 pmap_cache_attributes(
10363 	ppnum_t pn)
10364 {
10365 	pmap_paddr_t    paddr;
10366 	unsigned int    pai;
10367 	unsigned int    result;
10368 	pp_attr_t       pp_attr_current;
10369 
10370 	paddr = ptoa(pn);
10371 
10372 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10373 
10374 	if (!pa_valid(paddr)) {
10375 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10376 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10377 	}
10378 
10379 	result = VM_WIMG_DEFAULT;
10380 
10381 	pai = pa_index(paddr);
10382 
10383 	pp_attr_current = pp_attr_table[pai];
10384 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10385 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10386 	}
10387 	return result;
10388 }
10389 
/*
 * Perform cache maintenance needed when a page's WIMG attribute changes:
 * sync the page when moving away from a cacheable type, and force-clean
 * the dcache when moving to VM_WIMG_RT.
 *
 * NOTE(review): the clause `(wimg_bits_new != VM_WIMG_COPYBACK) ||
 * (wimg_bits_new != VM_WIMG_INNERWBACK)` is tautologically true (a value
 * cannot equal both constants), so any change away from WTHRU triggers a
 * sync; `&&` may have been intended.  Behavior (the conservative,
 * more-flushing variant) is preserved here — confirm against upstream.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Newly RT: force the data cache clean so no dirty cacheable lines remain. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10406 
/**
 * Switch the cache attribute of a single managed page between its original
 * value and the one used while the compressor maps it, then perform any
 * required cache maintenance.
 *
 * @param pn the page whose attribute is updated; must be managed (panics otherwise).
 * @param prev_cacheattr the attribute being replaced (used only for cache sync).
 * @param new_cacheattr the attribute to apply to the page's mappings.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must never have their attributes changed this way. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance must happen after the mapping update, outside the PVH lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10431 
/**
 * Return a kernel virtual address (physical aperture) for compressor use of
 * page 'pn'.  When the physical aperture is PTE-mapped, first force the page
 * to VM_WIMG_DEFAULT if it currently has a different cache attribute, so the
 * aperture access is performed with the default (cacheable) attribute.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10447 
/**
 * Counterpart to pmap_map_compressor_page(): if the page's recorded cache
 * attribute (still reported by pmap_cache_attributes()) differs from
 * VM_WIMG_DEFAULT, convert the page's mappings back from DEFAULT to that
 * attribute now that the compressor is done with it.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10462 
10463 /**
10464  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10465  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10466  *
10467  * @param user_page_list List of pages to be updated.
10468  * @param page_cnt Number of pages in total in user_page_list.
10469  * @param cacheattr The new cache attribute.
10470  *
10471  * @return Success if true is returned.
10472  */
10473 bool
10474 pmap_batch_set_cache_attributes(
10475 	upl_page_info_array_t user_page_list,
10476 	unsigned int page_cnt,
10477 	unsigned int cacheattr)
10478 {
10479 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10480 
10481 	if (page_cnt == 0) {
10482 		return true;
10483 	}
10484 
10485 	batch_set_cache_attr_state_t states;
10486 	states.page_index = 0;
10487 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10488 	states.tlb_flush_pass_needed = false;
10489 	states.rt_cache_flush_pass_needed = false;
10490 
10491 	/* Verify we are being called from a preemptible context. */
10492 	pmap_verify_preemptible();
10493 
10494 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10495 #if XNU_MONITOR
10496 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10497 #else /* !XNU_MONITOR */
10498 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10499 #endif /* XNU_MONITOR */
10500 	}
10501 
10502 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10503 	return true;
10504 }
10505 
10506 /**
10507  * Flushes TLB entries associated with the page specified by paddr, but do not
10508  * issue barriers yet.
10509  *
10510  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10511  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	pvh_assert_locked(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page; each pv_entry_t holds up to PTE_PER_PVE PTE slots. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU-owned entries are not CPU mappings; the CPU TLB has nothing to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		/* Queue a single-page flush for this mapping's pmap and VA. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10565 
10566 /**
10567  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10568  *
10569  * @param pai The Physical Address Index of the entry.
10570  * @param cacheattr The new cache attribute.
10571  */
10572 MARK_AS_PMAP_TEXT static void
10573 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10574 {
10575 	pvh_assert_locked(pai);
10576 
10577 	pp_attr_t pp_attr_current, pp_attr_template;
10578 	do {
10579 		pp_attr_current = pp_attr_table[pai];
10580 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10581 
10582 		/**
10583 		 * WIMG bits should only be updated under the PVH lock, but we should do
10584 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10585 		 */
10586 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10587 }
10588 
10589 /**
10590  * Batch updates the cache attributes of a list of pages in three passes.
10591  *
10592  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10593  * In pass two, TLB entries are flushed for each page in the list if necessary.
10594  * In pass three, caches are cleaned for each page in the list if necessary.
10595  *
10596  * When running in PPL, this function may decide to return to the caller in response
10597  * to AST_URGENT.
10598  *
10599  * @param user_page_list List of pages to be updated.
10600  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10601  * @param page_cnt Number of pages in total in user_page_list.
10602  * @param cacheattr The new cache attributes.
10603  *
10604  * @return The new state of the state machine.
10605  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable state machine's cursor and flags. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	/* Pass 1: update the pp_attr_table WIMG bits and the PTEs of each page. */
	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* An unset WIMG field means VM_WIMG_DEFAULT. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Pages newly made RT need their caches cleaned in pass 3. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Exiting early: balance the enable_dc_mva_ops() above before leaving. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the cursor and flags for the caller to resume with. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10842 
/**
 * Update the recorded cache attributes (WIMG bits) of a single managed page
 * and, if the effective attributes changed, rewrite the page's mappings.
 *
 * @param pn page number of the page to update.
 * @param cacheattr the new VM_WIMG_* cache attributes.
 * @param external (PPL builds only) whether the request originated outside
 *        the PPL; used to cross-check ownership of the page.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The caller's PPL/kernel origin must match the page's ownership. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An all-zero WIMG field encodes VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the page's mappings if the effective WIMG bits changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/*
	 * NOTE(review): pmap_sync_wimg presumably performs any cache maintenance
	 * required by the old->new WIMG transition — confirm in its definition.
	 */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10905 
/**
 * Internal/PPL-callable variant of pmap_set_cache_attributes().  Marks the
 * request as originating outside the PPL (external == TRUE).
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10913 
/**
 * Set the cache attributes of a managed page, dispatching to the PPL when
 * XNU_MONITOR is configured.
 *
 * @param pn page number of the page to update.
 * @param cacheattr the new VM_WIMG_* cache attributes.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10925 
10926 /**
10927  * Updates the page numbered ppnum to have attribute specified by attributes.
10928  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10929  * The necessity of the TLB flush is returned in case this function is called
10930  * in a batched manner and the TLB flush is intended to be done at a different
10931  * timing.
10932  *
10933  * @param ppnum Page Number of the page to be updated.
10934  * @param attributes The new cache attributes.
10935  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10936  *        immediately.
10937  *
10938  * @return Returns true if a TLB flush is needed for this update regardless of
10939  *         whether a flush has occurred already.
10940  */
10941 MARK_AS_PMAP_TEXT bool
10942 pmap_update_cache_attributes_locked(
10943 	ppnum_t ppnum,
10944 	unsigned attributes,
10945 	bool perform_tlbi)
10946 {
10947 	pmap_paddr_t    phys = ptoa(ppnum);
10948 	pv_entry_t      *pve_p;
10949 	pt_entry_t      *pte_p;
10950 	pv_entry_t      **pv_h;
10951 	pt_entry_t      tmplate;
10952 	unsigned int    pai;
10953 	boolean_t       tlb_flush_needed = false;
10954 
10955 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10956 
10957 	if (pmap_panic_dev_wimg_on_managed) {
10958 		switch (attributes & VM_WIMG_MASK) {
10959 		case VM_WIMG_IO:                        // nGnRnE
10960 		case VM_WIMG_POSTED:                    // nGnRE
10961 		/* supported on DRAM, but slow, so we disallow */
10962 
10963 		case VM_WIMG_POSTED_REORDERED:          // nGRE
10964 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10965 			/* unsupported on DRAM */
10966 
10967 			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10968 			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10969 			break;
10970 
10971 		default:
10972 			/* not device type memory, all good */
10973 
10974 			break;
10975 		}
10976 	}
10977 
10978 #if __ARM_PTE_PHYSMAP__
10979 	vm_offset_t kva = phystokv(phys);
10980 	pte_p = pmap_pte(kernel_pmap, kva);
10981 
10982 	tmplate = *pte_p;
10983 	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10984 #if XNU_MONITOR
10985 	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10986 #else
10987 	tmplate |= wimg_to_pte(attributes, phys);
10988 #endif
10989 	if (tmplate & ARM_PTE_HINT_MASK) {
10990 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10991 		    __FUNCTION__, pte_p, (void *)kva, tmplate);
10992 	}
10993 
10994 	if (perform_tlbi) {
10995 		write_pte_strong(pte_p, tmplate);
10996 		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10997 	} else {
10998 		write_pte_fast(pte_p, tmplate);
10999 	}
11000 	tlb_flush_needed = true;
11001 #endif
11002 
11003 	pai = pa_index(phys);
11004 
11005 	pv_h = pai_to_pvh(pai);
11006 
11007 	pte_p = PT_ENTRY_NULL;
11008 	pve_p = PV_ENTRY_NULL;
11009 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11010 		pte_p = pvh_ptep(pv_h);
11011 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11012 		pve_p = pvh_pve_list(pv_h);
11013 		pte_p = PT_ENTRY_NULL;
11014 	}
11015 
11016 	int pve_ptep_idx = 0;
11017 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11018 		vm_map_address_t va;
11019 		pmap_t          pmap;
11020 
11021 		if (pve_p != PV_ENTRY_NULL) {
11022 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11023 			if (pte_p == PT_ENTRY_NULL) {
11024 				goto cache_skip_pve;
11025 			}
11026 		}
11027 
11028 #ifdef PVH_FLAG_IOMMU
11029 		if (pvh_ptep_is_iommu(pte_p)) {
11030 			goto cache_skip_pve;
11031 		}
11032 #endif
11033 		pmap = ptep_get_pmap(pte_p);
11034 #if HAS_FEAT_XS
11035 		/**
11036 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11037 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11038 		 */
11039 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11040 #endif /* HAS_FEAT_XS */
11041 		va = ptep_get_va(pte_p);
11042 
11043 		tmplate = *pte_p;
11044 		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11045 		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11046 
11047 		if (perform_tlbi) {
11048 			write_pte_strong(pte_p, tmplate);
11049 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11050 			    pmap, true, false);
11051 		} else {
11052 			write_pte_fast(pte_p, tmplate);
11053 		}
11054 		tlb_flush_needed = true;
11055 
11056 cache_skip_pve:
11057 		pte_p = PT_ENTRY_NULL;
11058 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11059 			pve_ptep_idx = 0;
11060 			pve_p = pve_next(pve_p);
11061 		}
11062 	}
11063 	if (perform_tlbi && tlb_flush_needed) {
11064 #if HAS_FEAT_XS
11065 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11066 		arm64_sync_tlb(false);
11067 #else
11068 		/**
11069 		 * For targets that distinguish between mild and strong DSB, mild DSB
11070 		 * will not drain the prefetcher.  This can lead to prefetch-driven
11071 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
11072 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11073 		 */
11074 		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11075 #endif
11076 	}
11077 
11078 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11079 
11080 	return tlb_flush_needed;
11081 }
11082 
11083 /**
11084  * Mark a pmap as being dedicated to use for a commpage mapping.
11085  * The pmap itself will never be activated on a CPU; its mappings will
11086  * only be embedded in userspace pmaps at a fixed virtual address.
11087  *
11088  * @param pmap the pmap to mark as belonging to a commpage.
11089  */
11090 static void
11091 pmap_set_commpage(pmap_t pmap)
11092 {
11093 #if XNU_MONITOR
11094 	assert(!pmap_ppl_locked_down);
11095 #endif
11096 	assert(pmap->type == PMAP_TYPE_USER);
11097 	pmap->type = PMAP_TYPE_COMMPAGE;
11098 	/*
11099 	 * Free the pmap's ASID.  This pmap should not ever be directly
11100 	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
11101 	 * ASID space contention but will also cause pmap_switch() to panic
11102 	 * if an attacker tries to activate this pmap.  Disable preemption to
11103 	 * accommodate the *_nopreempt spinlock in free_asid().
11104 	 */
11105 	mp_disable_preemption();
11106 	pmap_get_pt_ops(pmap)->free_id(pmap);
11107 	mp_enable_preemption();
11108 }
11109 
11110 static void
11111 pmap_update_tt3e(
11112 	pmap_t pmap,
11113 	vm_address_t address,
11114 	tt_entry_t template)
11115 {
11116 	tt_entry_t *ptep, pte;
11117 
11118 	ptep = pmap_tt3e(pmap, address);
11119 	if (ptep == NULL) {
11120 		panic("%s: no ptep?", __FUNCTION__);
11121 	}
11122 
11123 	pte = *ptep;
11124 	pte = tte_to_pa(pte) | template;
11125 	write_pte_strong(ptep, pte);
11126 }
11127 
/*
 * PTE template for the commpage data mappings: read-only, non-executable
 * at all privilege levels.  Note absence of non-global bit.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping: read-only, user-executable
 * (ARM_PTE_NX omitted), kernel-non-executable (ARM_PTE_PNX set).
 * Note absence of non-global bit and no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11139 
/**
 * Allocate the commpage backing pages and build the dedicated commpage
 * pmap(s) that map them, returning the kernel-visible address of each page.
 *
 * @param kernel_data_addr out: kernel VA of the commpage data page.
 * @param kernel_text_addr out: kernel VA of the commpage text page, or 0 if
 *        no text page was allocated.
 * @param kernel_ro_data_addr out: kernel VA of the read-only data page (on
 *        non-PPL builds this is the same physical page as the data page).
 * @param user_text_addr out: randomized user VA at which the text page was
 *        mapped, or 0 when CONFIG_ARM_PFZ is not set.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, allocate (and zero) dedicated pages for each region. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index within the data commpage's leaf table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11290 
11291 
11292 /*
11293  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11294  * with user controlled TTEs for regions that aren't explicitly reserved by the
11295  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11296  */
11297 #if (ARM_PGSHIFT == 14)
11298 /**
11299  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11300  * commpage completely above the maximum 32-bit userspace VA.
11301  */
11302 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11303 
11304 /**
11305  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11306  * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
11308  * commpage lives within the userspace VA range, but is protected by the VM as
11309  * a reserved region (see vm_reserved_regions[] definition for more info).
11310  */
11311 
11312 #elif (ARM_PGSHIFT == 12)
11313 /**
11314  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11315  * above the maximum userspace VA.
11316  */
11317 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11318 #else
11319 #error Nested shared page mapping is unsupported on this config
11320 #endif
11321 
/**
 * Nest the commpage pmap's translation tables into the given pmap so the
 * commpage becomes visible at its fixed virtual address.
 *
 * @param pmap the pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_RESOURCE_SHORTAGE (PPL builds) or
 *         KERN_ABORTED when pmap_expand() could not complete, in which case
 *         the caller is expected to retry.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* In the PPL, do not block on allocation; surface shortage to the caller. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock around pmap_expand(); re-check the entry after relocking. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/*
	 * NOTE(review): this compares a twig-level TTE against ARM_PTE_EMPTY
	 * (pmap_unmap_commpage uses ARM_TTE_EMPTY) — presumably both empty
	 * encodings are identical; confirm.
	 */
	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	/* Share the commpage pmap's table by copying its twig entry. */
	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11424 
/**
 * Remove the commpage mapping from the given pmap by clearing the twig-level
 * entry that nests the commpage pmap's tables, then flushing the TLB.
 *
 * @param pmap the pmap whose commpage mapping is removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	/* Clear the nested entry and make the change visible to the walker. */
	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11481 
11482 void
11483 pmap_insert_commpage(
11484 	pmap_t pmap)
11485 {
11486 	kern_return_t kr = KERN_FAILURE;
11487 #if XNU_MONITOR
11488 	do {
11489 		kr = pmap_insert_commpage_ppl(pmap);
11490 
11491 		if (kr == KERN_RESOURCE_SHORTAGE) {
11492 			pmap_alloc_page_for_ppl(0);
11493 		}
11494 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11495 
11496 	pmap_ledger_check_balance(pmap);
11497 #else
11498 	do {
11499 		kr = pmap_insert_commpage_internal(pmap);
11500 	} while (kr == KERN_ABORTED);
11501 #endif
11502 
11503 	if (kr != KERN_SUCCESS) {
11504 		panic("%s: failed to insert the shared page, kr=%d, "
11505 		    "pmap=%p",
11506 		    __FUNCTION__, kr,
11507 		    pmap);
11508 	}
11509 }
11510 
/**
 * @return whether the given pmap manages a 64-bit address space.
 */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11517 
/**
 * @return whether the pmap is "exotic"; always false on this architecture.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11524 
11525 
/* ARMTODO -- an implementation that accounts for
 * holes in the physical map, if any.
 */
/**
 * @return whether the given page number refers to a managed physical page.
 */
boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	return pa_valid(ptoa(pn));
}
11535 
11536 boolean_t
11537 pmap_bootloader_page(
11538 	ppnum_t pn)
11539 {
11540 	pmap_paddr_t paddr = ptoa(pn);
11541 
11542 	if (pa_valid(paddr)) {
11543 		return FALSE;
11544 	}
11545 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11546 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11547 }
11548 
/**
 * Scan a VA range of a pmap for any valid leaf mappings.
 *
 * @param pmap the pmap to scan (NULL is treated as empty).
 * @param va_start start of the VA range to check.
 * @param va_end end of the VA range to check.
 *
 * @return TRUE if no PTE in the range is valid, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Capture not_in_kdp once so the lock and unlock decisions agree. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-table span at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Scan the leaf table entries covering this block. */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11609 
/**
 * Check whether a VA range of a pmap contains no valid mappings, dispatching
 * to the PPL when XNU_MONITOR is configured.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11622 
11623 vm_map_offset_t
11624 pmap_max_offset(
11625 	boolean_t               is64,
11626 	unsigned int    option)
11627 {
11628 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11629 }
11630 
11631 vm_map_offset_t
11632 pmap_max_64bit_offset(
11633 	__unused unsigned int option)
11634 {
11635 	vm_map_offset_t max_offset_ret = 0;
11636 
11637 #if defined(__arm64__)
11638 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11639 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11640 		max_offset_ret = arm64_pmap_max_offset_default;
11641 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11642 		max_offset_ret = min_max_offset;
11643 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11644 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11645 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11646 		if (arm64_pmap_max_offset_default) {
11647 			max_offset_ret = arm64_pmap_max_offset_default;
11648 		} else if (max_mem > 0xC0000000) {
11649 			// devices with > 3GB of memory
11650 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11651 		} else if (max_mem > 0x40000000) {
11652 			// devices with > 1GB and <= 3GB of memory
11653 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11654 		} else {
11655 			// devices with <= 1 GB of memory
11656 			max_offset_ret = min_max_offset;
11657 		}
11658 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11659 		if (arm64_pmap_max_offset_default) {
11660 			// Allow the boot-arg to override jumbo size
11661 			max_offset_ret = arm64_pmap_max_offset_default;
11662 		} else {
11663 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11664 		}
11665 	} else {
11666 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11667 	}
11668 
11669 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11670 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11671 		assert(max_offset_ret >= min_max_offset);
11672 	}
11673 #else
11674 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11675 #endif
11676 
11677 	return max_offset_ret;
11678 }
11679 
11680 vm_map_offset_t
11681 pmap_max_32bit_offset(
11682 	unsigned int option)
11683 {
11684 	vm_map_offset_t max_offset_ret = 0;
11685 
11686 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11687 		max_offset_ret = arm_pmap_max_offset_default;
11688 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11689 		max_offset_ret = VM_MAX_ADDRESS;
11690 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11691 		max_offset_ret = VM_MAX_ADDRESS;
11692 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11693 		if (arm_pmap_max_offset_default) {
11694 			max_offset_ret = arm_pmap_max_offset_default;
11695 		} else if (max_mem > 0x20000000) {
11696 			max_offset_ret = VM_MAX_ADDRESS;
11697 		} else {
11698 			max_offset_ret = VM_MAX_ADDRESS;
11699 		}
11700 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11701 		max_offset_ret = VM_MAX_ADDRESS;
11702 	} else {
11703 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11704 	}
11705 
11706 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11707 	return max_offset_ret;
11708 }
11709 
11710 #if CONFIG_DTRACE
11711 /*
11712  * Constrain DTrace copyin/copyout actions
11713  */
11714 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11715 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11716 
11717 kern_return_t
11718 dtrace_copyio_preflight(
11719 	__unused addr64_t va)
11720 {
11721 	if (current_map() == kernel_map) {
11722 		return KERN_FAILURE;
11723 	} else {
11724 		return KERN_SUCCESS;
11725 	}
11726 }
11727 
/* DTrace copyio postflight: no per-copy teardown is required on this platform. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11734 #endif /* CONFIG_DTRACE */
11735 
11736 
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
	/* No deferred-flush state to initialize on this architecture. */
}
11741 
11742 
11743 void
11744 pmap_flush(
11745 	__unused pmap_flush_context *cpus_to_flush)
11746 {
11747 	/* not implemented yet */
11748 	return;
11749 }
11750 
11751 #if XNU_MONITOR
11752 
11753 /*
11754  * Enforce that the address range described by kva and nbytes is not currently
11755  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11756  * unintentionally writing to PPL-owned memory.
11757  */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Walk the range one page at a time. */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Pinning through the static physical-aperture mapping is not allowed. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * Atomically set PP_ATTR_NO_MONITOR (preventing the page from
		 * becoming PPL-owned), retrying if the attribute word changes
		 * underneath us.  A page already owned by the PPL is fatal.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11780 
/*
 * Undo pmap_pin_kernel_pages(): clear PP_ATTR_NO_MONITOR on every page of
 * the range.  Panics if a page in the range was not actually pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A page cannot be both pinned and PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11798 
11799 /**
11800  * Lock down a page, making all mappings read-only, and preventing further
11801  * mappings or removal of this particular kva's mapping. Effectively, it makes
11802  * the physical page at kva immutable (see the ppl_writable parameter for an
11803  * exception to this).
11804  *
11805  * @param kva Valid address to any mapping of the physical page to lockdown.
11806  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11807  * @param ppl_writable True if the PPL should still be able to write to the page
11808  *                     using the physical aperture mapping. False will make the
11809  *                     page read-only for both the kernel and PPL in the
11810  *                     physical aperture.
11811  */
11812 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown: demote existing alias mappings to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11818 
11819 /**
11820  * Lock down a page, giving all mappings the specified maximum permissions, and
11821  * preventing further mappings or removal of this particular kva's mapping.
11822  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11823  * parameter for an exception to this).
11824  *
11825  * @param kva Valid address to any mapping of the physical page to lockdown.
11826  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11827  * @param ppl_writable True if the PPL should still be able to write to the page
11828  *                     using the physical aperture mapping. False will make the
11829  *                     page read-only for both the kernel and PPL in the
11830  *                     physical aperture.
11831  * @param prot Maximum permissions to allow in existing alias mappings
11832  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must supply a lockdown reason bit from PVH_FLAG_LOCKDOWN_MASK. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages are managed by the monitor itself; locking them down here is fatal. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Refuse double-lockdown, and refuse executable pages. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings of this physical page to at most 'prot'. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11880 
11881 /**
11882  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11883  * kernel once again.
11884  *
11885  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11886  *       to unlockdown a page that was never locked down, will panic.
11887  *
11888  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11889  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11890  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11891  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11892  *                     deviation will result in a panic.
11893  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlockdown of a page that was never locked down with this flag is fatal. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11914 
11915 /**
11916  * Release a page from being locked down to the PPL, making it writable to the
11917  * kernel once again.
11918  *
11919  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11920  *       to unlockdown a page that was never locked down, will panic.
11921  *
11922  * @param kva Valid address to any mapping of the physical page to unlockdown.
11923  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11924  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11925  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11926  *                     deviation will result in a panic.
11927  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Translate the kva, then do the real work under the PVH lock. */
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11939 
11940 #else /* XNU_MONITOR */
11941 
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
	/* Non-PPL build: no page can become PPL-owned, so pinning is a no-op. */
}
11946 
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
	/* Non-PPL build: nothing was pinned, so unpinning is a no-op. */
}
11951 
11952 #endif /* !XNU_MONITOR */
11953 
11954 
/*
 * Code-signing lockdown: on PPL builds tag the pages with the
 * PVH_FLAG_LOCKDOWN_CS reason; otherwise pass no flag bit.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11964 
/* Release a code-signing lockdown taken via pmap_cs_lockdown_pages(). */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11974 
11975 /**
11976  * Perform basic validation checks on the destination only and
11977  * corresponding offset/sizes prior to writing to a read only allocation.
11978  *
11979  * @note Should be called before writing to an allocation from the read
11980  * only allocator.
11981  *
11982  * @param zid The ID of the zone the allocation belongs to.
11983  * @param va VA of element being modified (destination).
11984  * @param offset Offset being written to, in the element.
11985  * @param new_data_size Size of modification.
11986  *
11987  */
11988 
11989 MARK_AS_PMAP_TEXT static void
11990 pmap_ro_zone_validate_element_dst(
11991 	zone_id_t           zid,
11992 	vm_offset_t         va,
11993 	vm_offset_t         offset,
11994 	vm_size_t           new_data_size)
11995 {
11996 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11997 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11998 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11999 	}
12000 
12001 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12002 
12003 	/* Check element is from correct zone and properly aligned */
12004 	zone_require_ro(zid, elem_size, (void*)va);
12005 
12006 	if (__improbable(new_data_size > (elem_size - offset))) {
12007 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
12008 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12009 	}
12010 	if (__improbable(offset >= elem_size)) {
12011 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
12012 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12013 	}
12014 }
12015 
12016 
12017 /**
12018  * Perform basic validation checks on the source, destination and
12019  * corresponding offset/sizes prior to writing to a read only allocation.
12020  *
12021  * @note Should be called before writing to an allocation from the read
12022  * only allocator.
12023  *
12024  * @param zid The ID of the zone the allocation belongs to.
12025  * @param va VA of element being modified (destination).
12026  * @param offset Offset being written to, in the element.
12027  * @param new_data Pointer to new data (source).
12028  * @param new_data_size Size of modification.
12029  *
12030  */
12031 
12032 MARK_AS_PMAP_TEXT static void
12033 pmap_ro_zone_validate_element(
12034 	zone_id_t           zid,
12035 	vm_offset_t         va,
12036 	vm_offset_t         offset,
12037 	const vm_offset_t   new_data,
12038 	vm_size_t           new_data_size)
12039 {
12040 	vm_offset_t sum = 0;
12041 
12042 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12043 		panic("%s: Integer addition overflow %p + %lu = %lu",
12044 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12045 	}
12046 
12047 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12048 }
12049 
12050 /**
12051  * Ensure that physical page is locked down and pinned, before writing to it.
12052  *
12053  * @note Should be called before writing to an allocation from the read
12054  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12055  * ensure that it is called after the modification.
12056  *
12057  *
12058  * @param pa Physical address of the element being modified.
12059  * @param va Virtual address of element being modified.
12060  * @param size Size of the modification.
12061  *
12062  */
12063 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Hold the PVH lock across the write; released in pmap_ro_zone_unlock_phy_page(). */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
12084 
12085 /**
12086  * Unlock and unpin physical page after writing to it.
12087  *
12088  * @note Should be called after writing to an allocation from the read
12089  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12090  * ensure that it has been called prior to the modification.
12091  *
12092  * @param pa Physical address of the element that was modified.
12093  * @param va Virtual address of element that was modified.
12094  * @param size Size of the modification.
12095  *
12096  */
12097 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_unlock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Unpin first, then drop the PVH lock taken in pmap_ro_zone_lock_phy_page(). */
	pmap_unpin_kernel_pages(va, size);
	pvh_unlock(pai);
}
12108 
12109 /**
12110  * Function to copy kauth_cred from new_data to kv.
12111  * Function defined in "kern_prot.c"
12112  *
12113  * @note Will be removed upon completion of
12114  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12115  *
12116  * @param kv Address to copy new data to.
12117  * @param new_data Pointer to new data.
12118  *
12119  */
12120 
12121 extern void
12122 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12123 
12124 /**
12125  * Zalloc-specific memcpy that writes through the physical aperture
12126  * and ensures the element being modified is from a read-only zone.
12127  *
12128  * @note Designed to work only with the zone allocator's read-only submap.
12129  *
12130  * @param zid The ID of the zone to allocate from.
12131  * @param va VA of element to be modified.
12132  * @param offset Offset from element.
12133  * @param new_data Pointer to new data.
12134  * @param new_data_size	Size of modification.
12135  *
12136  */
12137 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* On PPL builds the write must be performed inside the monitor. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12152 
12153 MARK_AS_PMAP_TEXT void
12154 pmap_ro_zone_memcpy_internal(
12155 	zone_id_t             zid,
12156 	vm_offset_t           va,
12157 	vm_offset_t           offset,
12158 	const vm_offset_t     new_data,
12159 	vm_size_t             new_data_size)
12160 {
12161 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12162 
12163 	if (!new_data || new_data_size == 0) {
12164 		return;
12165 	}
12166 
12167 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12168 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12169 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12170 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12171 }
12172 
12173 /**
12174  * Zalloc-specific function to atomically mutate fields of an element that
12175  * belongs to a read-only zone, via the physcial aperture.
12176  *
12177  * @note Designed to work only with the zone allocator's read-only submap.
12178  *
12179  * @param zid The ID of the zone the element belongs to.
12180  * @param va VA of element to be modified.
12181  * @param offset Offset in element.
12182  * @param op Atomic operation to perform.
12183  * @param value	Mutation value.
12184  *
12185  */
12186 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* On PPL builds the mutation must be performed inside the monitor. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12201 
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low 4 bits of the op carry the operand size in bytes (zro_atomic_op_t encoding). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	/* Perform the atomic mutation through the physical aperture; returns the result. */
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12220 
12221 /**
12222  * bzero for allocations from read only zones, that writes through the
12223  * physical aperture.
12224  *
12225  * @note This is called by the zfree path of all allocations from read
12226  * only zones.
12227  *
12228  * @param zid The ID of the zone the allocation belongs to.
12229  * @param va VA of element to be zeroed.
12230  * @param offset Offset in the element.
12231  * @param size	Size of allocation.
12232  *
12233  */
12234 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* On PPL builds the zeroing must be performed inside the monitor. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12248 
12249 MARK_AS_PMAP_TEXT void
12250 pmap_ro_zone_bzero_internal(
12251 	zone_id_t       zid,
12252 	vm_offset_t     va,
12253 	vm_offset_t     offset,
12254 	vm_size_t       size)
12255 {
12256 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12257 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
12258 	pmap_ro_zone_lock_phy_page(pa, va, size);
12259 	bzero((void*)phystokv(pa), size);
12260 	pmap_ro_zone_unlock_phy_page(pa, va, size);
12261 }
12262 
12263 /**
12264  * Removes write access from the Physical Aperture.
12265  *
12266  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12267  * @note Designed to work only with the zone allocator's read-only submap.
12268  *
12269  * @param va VA of the page to restore write access to.
12270  *
12271  */
12272 MARK_AS_PMAP_TEXT static void
12273 pmap_phys_write_disable(vm_address_t va)
12274 {
12275 #if XNU_MONITOR
12276 	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
12277 #else /* XNU_MONITOR */
12278 	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
12279 #endif /* XNU_MONITOR */
12280 }
12281 
12282 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12283 
/*
 * Count resident and compressed bytes for a page-aligned VA range that must
 * lie within a single twig (TTE) span.  Returns the resident byte count, or
 * PMAP_RESIDENT_INVALID if the pmap is NULL or has no TTE for the range.
 * Compressed bytes are *accumulated* into *compressed_bytes_p when non-NULL.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Classify each PTE in the range as compressed, resident, or neither. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	/* Pin the caller-supplied pointer while writing through it (PPL safety). */
	if (compressed_bytes_p) {
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12345 
12346 mach_vm_size_t
12347 pmap_query_resident(
12348 	pmap_t                  pmap,
12349 	vm_map_address_t        start,
12350 	vm_map_address_t        end,
12351 	mach_vm_size_t          *compressed_bytes_p)
12352 {
12353 	mach_vm_size_t          total_resident_bytes;
12354 	mach_vm_size_t          compressed_bytes;
12355 	vm_map_address_t        va;
12356 
12357 
12358 	if (pmap == PMAP_NULL) {
12359 		if (compressed_bytes_p) {
12360 			*compressed_bytes_p = 0;
12361 		}
12362 		return 0;
12363 	}
12364 
12365 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12366 
12367 	total_resident_bytes = 0;
12368 	compressed_bytes = 0;
12369 
12370 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12371 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12372 	    VM_KERNEL_ADDRHIDE(end));
12373 
12374 	va = start;
12375 	while (va < end) {
12376 		vm_map_address_t l;
12377 		mach_vm_size_t resident_bytes;
12378 
12379 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12380 
12381 		if (l > end) {
12382 			l = end;
12383 		}
12384 #if XNU_MONITOR
12385 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12386 #else
12387 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12388 #endif
12389 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12390 			break;
12391 		}
12392 
12393 		total_resident_bytes += resident_bytes;
12394 
12395 		va = l;
12396 	}
12397 
12398 	if (compressed_bytes_p) {
12399 		*compressed_bytes_p = compressed_bytes;
12400 	}
12401 
12402 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12403 	    total_resident_bytes);
12404 
12405 	return total_resident_bytes;
12406 }
12407 
12408 #if MACH_ASSERT
12409 static void
12410 pmap_check_ledgers(
12411 	pmap_t pmap)
12412 {
12413 	int     pid;
12414 	char    *procname;
12415 
12416 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12417 		/*
12418 		 * This pmap was not or is no longer fully associated
12419 		 * with a task (e.g. the old pmap after a fork()/exec() or
12420 		 * spawn()).  Its "ledger" still points at a task that is
12421 		 * now using a different (and active) address space, so
12422 		 * we can't check that all the pmap ledgers are balanced here.
12423 		 *
12424 		 * If the "pid" is set, that means that we went through
12425 		 * pmap_set_process() in task_terminate_internal(), so
12426 		 * this task's ledger should not have been re-used and
12427 		 * all the pmap ledgers should be back to 0.
12428 		 */
12429 		return;
12430 	}
12431 
12432 	pid = pmap->pmap_pid;
12433 	procname = pmap->pmap_procname;
12434 
12435 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12436 }
12437 #endif /* MACH_ASSERT */
12438 
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
	/* Page-zero advice is not used by the ARM pmap; intentionally a no-op. */
}
12443 
12444 /**
12445  * The minimum shared region nesting size is used by the VM to determine when to
12446  * break up large mappings to nested regions. The smallest size that these
12447  * mappings can be broken into is determined by what page table level those
12448  * regions are being nested in at and the size of the page tables.
12449  *
12450  * For instance, if a nested region is nesting at L2 for a process utilizing
12451  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12452  * block entry).
12453  *
12454  * @param pmap The target pmap to determine the block size based on whether it's
12455  *             using 16KB or 4KB page tables.
12456  */
12457 uint64_t
12458 pmap_shared_region_size_min(__unused pmap_t pmap)
12459 {
12460 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12461 
12462 	/**
12463 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12464 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12465 	 * point to shared L3 page tables in the shared region pmap.
12466 	 */
12467 	return pt_attr_twig_size(pt_attr);
12468 }
12469 
12470 boolean_t
12471 pmap_enforces_execute_only(
12472 	pmap_t pmap)
12473 {
12474 	return pmap != kernel_pmap;
12475 }
12476 
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	/* Record the VM map's code-signing-enforcement state on the pmap. */
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12485 
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	/* On PPL builds the pmap field must be written inside the monitor. */
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12497 
12498 extern int cs_process_enforcement_enable;
12499 bool
12500 pmap_get_vm_map_cs_enforced(
12501 	pmap_t pmap)
12502 {
12503 	if (cs_process_enforcement_enable) {
12504 		return true;
12505 	}
12506 	return pmap->pmap_vm_map_cs_enforced;
12507 }
12508 
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	/* JIT entitlement is not tracked at the pmap layer in this configuration. */
	return;
}
12515 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
	/* Dispatch to the PPL or in-kernel variant; both are no-ops here. */
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12526 
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	/* Always false: pmap_set_jit_entitled_internal() records nothing. */
	return false;
}
12533 
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	/* TPRO is not tracked at the pmap layer in this configuration. */
	return;
}
12540 
void
pmap_set_tpro(
	pmap_t pmap)
{
	/* Dispatch to the PPL or in-kernel variant; both are no-ops here. */
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12551 
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	/* Always false: pmap_set_tpro_internal() records nothing. */
	return false;
}
12558 
12559 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12560 
/*
 * Report the disposition (PMAP_QUERY_PAGE_* bits) of the page mapped at 'va'
 * in 'pmap', written through the caller-supplied (and pinned) 'disp_p'.
 * Returns KERN_INVALID_ARGUMENT for a NULL or kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE; re-checked below after the PVH lock is taken. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical page: it may instead be a compressed-pager marker. */
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		/* Pages outside managed memory carry no per-page attributes. */
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping to query per-mapping attributes. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the caller's pointer while writing through it (PPL safety). */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12639 
/*
 * Query the disposition (present / compressed / reusable / altacct / ...)
 * of the page mapped at 'va' in 'pmap'; see
 * pmap_query_page_info_internal() for details.  Dispatches into the PPL
 * when the monitor is compiled in.
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12652 
12653 
12654 
/*
 * Return the number of valid user virtual-address bits for the given pmap.
 * With mixed page-size support the value is derived from the pmap's own
 * TCR T0SZ field; otherwise the boot-time T0SZ applies to every pmap.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* VA bits = 64 - T0SZ, with T0SZ extracted from this pmap's TCR value */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12665 
/*
 * Return the number of valid kernel virtual-address bits (64 - boot T1SZ).
 */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12671 
/*
 * Return the size, in bytes, of the user virtual address space of 'pmap'.
 */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12677 
12678 
12679 
/*
 * Report whether the caller is currently executing inside the PPL.  This
 * configuration has no PPL, so the answer is always false.
 */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12686 
/*
 * Perform an I/O-filter-protected write.  Not supported on this platform:
 * always panics and never returns.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12693 
/*
 * Claim a reserved PPL page.  Not supported on this configuration; always
 * returns NULL.
 */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12700 
/*
 * Return a previously claimed reserved PPL page.  Not supported on this
 * configuration; the call is a no-op.
 */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12706 
12707 
12708 #if PMAP_CS_PPL_MONITOR
12709 
/* Immutable part of the trust cache runtime (write-protected after boot) */
SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;

/* Mutable part of the trust cache runtime; lives in PPL-protected data */
MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;

/* Reader-writer lock serializing access to the trust cache runtime */
MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12718 
/*
 * PPL-side implementation of pmap_check_trust_cache_runtime_for_uuid().
 * Checks whether a trust cache with the given UUID is loaded in the PPL
 * trust cache runtime.
 *
 * @return KERN_SUCCESS if found, KERN_NOT_FOUND if not found,
 *         KERN_NOT_SUPPORTED when the AMFI interface is too old to support
 *         the query, or KERN_FAILURE for any other libTrustCache error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	if (amfi->TrustCache.version < 3) {
		/* AMFI change hasn't landed in the build */
		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
		return KERN_NOT_SUPPORTED;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Translate the libTrustCache result into a kern_return_t */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12754 
/*
 * Kernel-side wrapper: proxy the UUID lookup into the PPL.  See
 * pmap_check_trust_cache_runtime_for_uuid_internal() for return values.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12761 
/*
 * PPL-side implementation of pmap_load_trust_cache_with_type().
 *
 * Validates and loads an image4-wrapped trust cache into the PPL trust
 * cache runtime.  The payload pages are locked down (transferred to PPL
 * ownership) for the lifetime of the trust cache and only released again on
 * failure; the manifest pages are always released before returning.
 *
 * @return KERN_SUCCESS on load, KERN_ALREADY_IN_SET for a duplicate,
 *         KERN_RESOURCE_SHORTAGE when the kernel must first donate a spare
 *         PPL page, or KERN_FAILURE for other libTrustCache errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Only certain trust cache types may be loaded through this interface */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		/* KERN_RESOURCE_SHORTAGE tells the kernel wrapper to donate a page and retry */
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12881 
12882 kern_return_t
12883 pmap_load_trust_cache_with_type(
12884 	TCType_t type,
12885 	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12886 	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12887 	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12888 {
12889 	kern_return_t ret = KERN_DENIED;
12890 
12891 	ret = pmap_load_trust_cache_with_type_ppl(
12892 		type,
12893 		pmap_img4_payload, pmap_img4_payload_len,
12894 		img4_manifest, img4_manifest_len,
12895 		img4_aux_manifest, img4_aux_manifest_len);
12896 
12897 	while (ret == KERN_RESOURCE_SHORTAGE) {
12898 		/* Allocate a page from the free list */
12899 		pmap_alloc_page_for_ppl(0);
12900 
12901 		/* Attempt the call again */
12902 		ret = pmap_load_trust_cache_with_type_ppl(
12903 			type,
12904 			pmap_img4_payload, pmap_img4_payload_len,
12905 			img4_manifest, img4_manifest_len,
12906 			img4_aux_manifest, img4_aux_manifest_len);
12907 	}
12908 
12909 	return ret;
12910 }
12911 
/*
 * Query the PPL trust cache runtime for a CDHash.  "Safe" means all
 * arguments are expected to already reside in PPL-accessible storage, so
 * this can be called directly from PPL context.
 *
 * @return KERN_SUCCESS if found, KERN_NOT_FOUND if not,
 *         KERN_INVALID_ARGUMENT for a bad query type, or KERN_FAILURE for
 *         other libTrustCache errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Translate the libTrustCache result into a kern_return_t */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12950 
/*
 * PPL-side entry point for pmap_query_trust_cache().  Copies the
 * kernel-provided CDHash into PPL storage, performs the query through the
 * safe API, and writes the query token back out to the kernel if requested.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	/* Copy the token back out, pinning the kernel pages for the write */
	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
12975 
12976 kern_return_t
12977 pmap_query_trust_cache(
12978 	TCQueryType_t query_type,
12979 	const uint8_t cdhash[kTCEntryHashSize],
12980 	TrustCacheQueryToken_t *query_token)
12981 {
12982 	kern_return_t ret = KERN_NOT_FOUND;
12983 
12984 	ret = pmap_query_trust_cache_ppl(
12985 		query_type,
12986 		cdhash,
12987 		query_token);
12988 
12989 	return ret;
12990 }
12991 
/* True once the developer mode state has been explicitly set at least once */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
/* The current developer mode state of the system */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12994 
/*
 * PPL-side implementation of pmap_toggle_developer_mode().  Validates and
 * applies a developer mode state transition; a false --> true transition
 * after the state has been set is never allowed and causes a panic.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13023 
/*
 * Kernel-side wrapper: proxy the developer mode state change into the PPL.
 * See pmap_toggle_developer_mode_internal() for the allowed transitions.
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13030 
13031 #endif /* PMAP_CS_PPL_MONITOR */
13032 
13033 #if PMAP_CS_INCLUDE_CODE_SIGNING
13034 
/*
 * Ordering function for the provisioning profile red-black tree.  Profiles
 * are keyed purely by their address, so ordering by numeric pointer value
 * is sufficient.
 *
 * The comparison is performed on uintptr_t values rather than on the raw
 * pointers: relational comparison of pointers into different objects is
 * undefined behavior in C (C17 6.5.8p5), while integer comparison of the
 * converted addresses is well defined.
 *
 * @return -1, 0, or 1 as expected by RB_GENERATE.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
13047 
/* Red-black tree of all provisioning profiles registered with the PPL */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the tree operations (RB_INSERT, RB_FIND, RB_REMOVE, ...) */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13057 
/*
 * One-time initialization of the PPL provisioning profile state: sets up
 * the red-black tree of registered profiles and its lock.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* NOTE(review): the lock is made non-sleeping, presumably because it is taken from PPL context -- confirm */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13070 
13071 static bool
13072 pmap_is_testflight_profile(
13073 	pmap_cs_profile_t *profile_obj)
13074 {
13075 	const char *entitlement_name = "beta-reports-active";
13076 	const size_t entitlement_length = strlen(entitlement_name);
13077 	CEQueryOperation_t query[2] = {0};
13078 
13079 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13080 	if (profile_obj->entitlements_ctx == NULL) {
13081 		return false;
13082 	}
13083 
13084 	/* Build our CoreEntitlements query */
13085 	query[0].opcode = kCEOpSelectKey;
13086 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13087 	query[0].parameters.stringParameter.length = entitlement_length;
13088 	query[1] = CEMatchBool(true);
13089 
13090 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13091 		profile_obj->entitlements_ctx,
13092 		query, 2);
13093 
13094 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13095 		return true;
13096 	}
13097 
13098 	return false;
13099 }
13100 
13101 static bool
13102 pmap_is_development_profile(
13103 	pmap_cs_profile_t *profile_obj)
13104 {
13105 	/* Check for UPP */
13106 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13107 		*profile_obj->profile_ctx,
13108 		CESelectDictValue("ProvisionsAllDevices"));
13109 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13110 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13111 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13112 			return false;
13113 		}
13114 	}
13115 
13116 	/* Check for TestFlight profile */
13117 	if (pmap_is_testflight_profile(profile_obj) == true) {
13118 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13119 		return false;
13120 	}
13121 
13122 	pmap_cs_log_info("%p: development profile", profile_obj);
13123 	return true;
13124 }
13125 
/*
 * Extract, validate, and set up the "Entitlements" dictionary provisioned
 * by a profile, storing the resulting query context within the profile
 * object.
 *
 * @return KERN_SUCCESS when the entitlements context is set up,
 *         KERN_NOT_FOUND when the profile provisions no entitlements, or
 *         KERN_ABORTED when CoreEntitlements validation/acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements dictionary: clear the context storage */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER-encoded entitlements through CoreEntitlements */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Acquire a query context for the validated entitlements */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13176 
/*
 * PPL-side implementation of pmap_register_provisioning_profile().
 *
 * Locks down the kernel-provided payload, validates the embedded profile
 * through CoreTrust, sets up its CoreEntitlements contexts, and inserts the
 * profile object into the registered-profiles red-black tree.  Most
 * validation failures are treated as fatal (panic), since they indicate a
 * malformed payload handed to the PPL.
 *
 * @return KERN_SUCCESS on registration, or KERN_RESOURCE_SHORTAGE when the
 *         kernel must first donate a spare PPL page and retry.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile blob's signature through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Create a CoreEntitlements context over the validated profile content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13280 
13281 kern_return_t
13282 pmap_register_provisioning_profile(
13283 	const vm_address_t payload_addr,
13284 	const vm_size_t payload_size)
13285 {
13286 	kern_return_t ret = KERN_DENIED;
13287 
13288 	ret = pmap_register_provisioning_profile_ppl(
13289 		payload_addr,
13290 		payload_size);
13291 
13292 	while (ret == KERN_RESOURCE_SHORTAGE) {
13293 		/* Allocate a page from the free list */
13294 		pmap_alloc_page_for_ppl(0);
13295 
13296 		/* Attempt the call again */
13297 		ret = pmap_register_provisioning_profile_ppl(
13298 			payload_addr,
13299 			payload_size);
13300 	}
13301 
13302 	return ret;
13303 }
13304 
/*
 * PPL-side implementation of pmap_unregister_provisioning_profile().
 * Removes a registered profile from the red-black tree, provided it has no
 * outstanding references, and releases its original payload pages back to
 * the kernel.
 *
 * @return KERN_SUCCESS on unregistration, or KERN_FAILURE when the profile
 *         is still referenced by one or more code signatures.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A referenced profile may not be unregistered */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile object lives in the released payload -- do not touch it past this point */
		profile_obj = NULL;
	}
	return ret;
}
13352 
/*
 * Kernel-side wrapper: proxy the profile unregistration into the PPL.  See
 * pmap_unregister_provisioning_profile_internal() for return values.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13359 
/*
 * PPL-side implementation of pmap_associate_provisioning_profile().
 * Associates a registered, validated profile with a not-yet-verified code
 * signature and takes a reference on the profile.
 *
 * @return KERN_SUCCESS on association, or KERN_DENIED when the signature
 *         is already verified or already has an associated profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13408 
/*
 * Kernel-side wrapper: proxy the profile association into the PPL.  See
 * pmap_associate_provisioning_profile_internal() for return values.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13416 
/*
 * PPL-side implementation of pmap_disassociate_provisioning_profile().
 * Detaches the profile associated with a code signature and drops the
 * reference taken at association time.
 *
 * @return KERN_SUCCESS on disassociation, or KERN_NOT_FOUND when the
 *         signature has no associated profile.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13452 
/*
 * Kernel-side wrapper: proxy the profile disassociation into the PPL.  See
 * pmap_disassociate_provisioning_profile_internal() for return values.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13459 
13460 kern_return_t
13461 pmap_associate_kernel_entitlements_internal(
13462 	pmap_cs_code_directory_t *cd_entry,
13463 	const void *kernel_entitlements)
13464 {
13465 	kern_return_t ret = KERN_DENIED;
13466 
13467 	if (kernel_entitlements == NULL) {
13468 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13469 	}
13470 
13471 	/* Acquire the lock on the code directory */
13472 	pmap_cs_lock_code_directory(cd_entry);
13473 
13474 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13475 		ret = KERN_DENIED;
13476 		goto out;
13477 	} else if (cd_entry->kernel_entitlements != NULL) {
13478 		ret = KERN_DENIED;
13479 		goto out;
13480 	}
13481 	cd_entry->kernel_entitlements = kernel_entitlements;
13482 
13483 	/* Association was a success */
13484 	ret = KERN_SUCCESS;
13485 
13486 out:
13487 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13488 	return ret;
13489 }
13490 
/*
 * Kernel-side wrapper: proxy the kernel entitlements association into the
 * PPL.  See pmap_associate_kernel_entitlements_internal() for details.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13498 
/*
 * PPL-side implementation of pmap_resolve_kernel_entitlements().  Looks up
 * the kernel entitlements object previously associated with the pmap's
 * main code-signing region and, if requested, writes it out through
 * 'kernel_entitlements'.
 *
 * @return KERN_SUCCESS on resolution; KERN_NOT_FOUND for the kernel pmap,
 *         a missing signature, or no associated entitlements; KERN_ABORTED
 *         when the pmap lock could not be taken without waiting (the caller
 *         should retry).
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13561 
13562 kern_return_t
13563 pmap_resolve_kernel_entitlements(
13564 	pmap_t pmap,
13565 	const void **kernel_entitlements)
13566 {
13567 	kern_return_t ret = KERN_DENIED;
13568 
13569 	do {
13570 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13571 	} while (ret == KERN_ABORTED);
13572 
13573 	return ret;
13574 }
13575 
/*
 * Build the CoreEntitlements acceleration index for a code signature so that
 * subsequent entitlement queries are faster.
 *
 * The index buffer is carved out of spare space at the end of the locked-down
 * signature region when it fits; otherwise it is bucket-allocated through the
 * blob allocator, or backed by a whole page from the PPL free list.
 *
 * Returns KERN_SUCCESS when the context is accelerated (or needs/permits no
 * acceleration), KERN_DENIED for non-reconstituted or untrusted signatures,
 * KERN_ABORTED when the index would not fit in a page, and an allocator
 * error (the kernel-side wrapper retries on KERN_RESOURCE_SHORTAGE after
 * donating a page to the PPL).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	/* Start out with a non-success error code */
	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without a context, or when already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Align the cursor just past the signature up to pointer alignment */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Not separately allocated -- lives inside the signature region */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		/* Mark as separately allocated so it can be reclaimed later */
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13722 
13723 kern_return_t
13724 pmap_accelerate_entitlements(
13725 	pmap_cs_code_directory_t *cd_entry)
13726 {
13727 	kern_return_t ret = KERN_DENIED;
13728 
13729 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
13730 	while (ret == KERN_RESOURCE_SHORTAGE) {
13731 		/* Allocate a page for the PPL */
13732 		pmap_alloc_page_for_ppl(0);
13733 
13734 		/* Try again */
13735 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
13736 	}
13737 
13738 	return ret;
13739 }
13740 
13741 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13742 
13743 MARK_AS_PMAP_TEXT bool
13744 pmap_lookup_in_loaded_trust_caches_internal(
13745 	const uint8_t cdhash[CS_CDHASH_LEN])
13746 {
13747 	kern_return_t kr = KERN_NOT_FOUND;
13748 
13749 #if PMAP_CS_PPL_MONITOR
13750 	/*
13751 	 * If we have the PPL monitor, then this function can only be called from
13752 	 * within the PPL. Calling it directly would've caused a panic, so we can
13753 	 * assume that we're in the PPL here.
13754 	 */
13755 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13756 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13757 
13758 	kr = pmap_query_trust_cache_safe(
13759 		kTCQueryTypeLoadable,
13760 		cdhash_safe,
13761 		NULL);
13762 #else
13763 	kr = query_trust_cache(
13764 		kTCQueryTypeLoadable,
13765 		cdhash,
13766 		NULL);
13767 #endif
13768 
13769 	if (kr == KERN_SUCCESS) {
13770 		return true;
13771 	}
13772 	return false;
13773 }
13774 
/*
 * Query the loadable trust caches for a CDHash. Dispatches into the PPL
 * when the monitor is enabled, otherwise calls the internal helper directly.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13785 
13786 MARK_AS_PMAP_TEXT uint32_t
13787 pmap_lookup_in_static_trust_cache_internal(
13788 	const uint8_t cdhash[CS_CDHASH_LEN])
13789 {
13790 	TrustCacheQueryToken_t query_token = {0};
13791 	kern_return_t kr = KERN_NOT_FOUND;
13792 	uint64_t flags = 0;
13793 	uint8_t hash_type = 0;
13794 
13795 #if PMAP_CS_PPL_MONITOR
13796 	/*
13797 	 * If we have the PPL monitor, then this function can only be called from
13798 	 * within the PPL. Calling it directly would've caused a panic, so we can
13799 	 * assume that we're in the PPL here.
13800 	 */
13801 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13802 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13803 
13804 	kr = pmap_query_trust_cache_safe(
13805 		kTCQueryTypeStatic,
13806 		cdhash_safe,
13807 		&query_token);
13808 #else
13809 	kr = query_trust_cache(
13810 		kTCQueryTypeStatic,
13811 		cdhash,
13812 		&query_token);
13813 #endif
13814 
13815 	if (kr == KERN_SUCCESS) {
13816 		amfi->TrustCache.queryGetFlags(&query_token, &flags);
13817 		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
13818 
13819 		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
13820 		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
13821 		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
13822 	}
13823 
13824 	return 0;
13825 }
13826 
/*
 * Query the static trust cache for a CDHash. Dispatches into the PPL when
 * the monitor is enabled, otherwise calls the internal helper directly.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13836 
13837 #if PMAP_CS_INCLUDE_CODE_SIGNING
13838 
/* Lock guarding reads/writes of the compilation service CDHash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash registered for the compilation service; zero until set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13841 
13842 MARK_AS_PMAP_TEXT void
13843 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13844 {
13845 
13846 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13847 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13848 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13849 
13850 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13851 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13852 }
13853 
13854 MARK_AS_PMAP_TEXT bool
13855 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13856 {
13857 	bool match = false;
13858 
13859 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13860 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13861 		match = true;
13862 	}
13863 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13864 
13865 	if (match) {
13866 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13867 	}
13868 
13869 	return match;
13870 }
13871 
/*
 * Register the compilation service CDHash. Dispatches into the PPL when the
 * monitor is enabled, otherwise calls the internal helper directly.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13881 
/*
 * Match a CDHash against the registered compilation service CDHash.
 * Dispatches into the PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13891 
13892 /*
13893  * As part of supporting local signing on the device, we need the PMAP layer
13894  * to store the local signing key so that PMAP_CS can validate with it. We
13895  * store it at the PMAP layer such that it is accessible to both AMFI and
13896  * PMAP_CS should they need it.
13897  */
13898 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13899 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13900 
13901 MARK_AS_PMAP_TEXT void
13902 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13903 {
13904 	bool key_set = false;
13905 
13906 	/*
13907 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13908 	 * a successful exchange means that the local signing public key has _not_ been
13909 	 * set. In case the key has been set, we panic as we would never expect the
13910 	 * kernel to attempt to set the key more than once.
13911 	 */
13912 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13913 
13914 	if (key_set) {
13915 		panic("attempted to set the local signing public key multiple times");
13916 	}
13917 
13918 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13919 	pmap_cs_log_info("set local signing public key");
13920 }
13921 
/*
 * Install the local signing public key. Dispatches into the PPL when the
 * monitor is enabled, otherwise calls the internal helper directly.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13931 
13932 uint8_t*
13933 pmap_get_local_signing_public_key(void)
13934 {
13935 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13936 
13937 	if (key_set) {
13938 		return pmap_local_signing_public_key;
13939 	}
13940 
13941 	return NULL;
13942 }
13943 
13944 /*
13945  * Locally signed applications need to be explicitly authorized by an entitled application
13946  * before we allow them to run.
13947  */
13948 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
13949 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13950 
13951 MARK_AS_PMAP_TEXT void
13952 pmap_unrestrict_local_signing_internal(
13953 	const uint8_t cdhash[CS_CDHASH_LEN])
13954 {
13955 
13956 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13957 	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13958 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13959 
13960 	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
13961 	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
13962 }
13963 
/*
 * Mark a CDHash as exempt from local-signing restrictions. Dispatches into
 * the PPL when the monitor is enabled.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
13974 
13975 #if PMAP_CS
/*
 * Clear the stashed unrestricted CDHash, re-imposing local-signing
 * restrictions until the next unrestrict call.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13983 
13984 MARK_AS_PMAP_TEXT static bool
13985 pmap_local_signing_restricted(
13986 	const uint8_t cdhash[CS_CDHASH_LEN])
13987 {
13988 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13989 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13990 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13991 
13992 	return ret != 0;
13993 }
13994 
13995 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13996 #endif
13997 
13998 MARK_AS_PMAP_TEXT void
13999 pmap_footprint_suspend_internal(
14000 	vm_map_t        map,
14001 	boolean_t       suspend)
14002 {
14003 #if DEVELOPMENT || DEBUG
14004 	if (suspend) {
14005 		current_thread()->pmap_footprint_suspended = TRUE;
14006 		map->pmap->footprint_was_suspended = TRUE;
14007 	} else {
14008 		current_thread()->pmap_footprint_suspended = FALSE;
14009 	}
14010 #else /* DEVELOPMENT || DEBUG */
14011 	(void) map;
14012 	(void) suspend;
14013 #endif /* DEVELOPMENT || DEBUG */
14014 }
14015 
/*
 * Suspend or resume footprint accounting for the current thread.
 * Dispatches into the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14027 
/*
 * No-op PPL entry point: validates the pmap and does nothing else.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14033 
/*
 * No-op pmap call. Dispatches into the PPL when the monitor is enabled.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14043 
14044 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14045 
/*
 * Header emitted before each raw table copied out by
 * pmap_dump_page_tables_recurse().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the table */
	uint64_t num_entries; /* number of entries copied after this header */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA just past the range translated by this table */
};
14052 
/*
 * Recursively copy the page tables of a pmap into [buf_start, buf_end).
 * For each table at a level selected by level_mask, a
 * struct page_table_dump_header is written followed by the raw table
 * contents; *bytes_copied tracks the running output size.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE when the buffer cannot hold the
 * next table, KERN_SUCCESS otherwise. Panics on a corrupt (table-type)
 * entry at the leaf level.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Output cursor: everything before *bytes_copied is already written */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be larger than one page and starts level-aligned */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Bail out early if the header plus the raw table won't fit */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	/* Only dump this level if the caller asked for it */
	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Walk every entry, recursing into table (non-block) entries */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			continue;
		} else {
			/* A table-type entry at the leaf level indicates corruption */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14127 
/*
 * Dump the page tables of a pmap into [bufp, buf_end), starting at the root
 * level, emitting a header plus raw table for every level selected by
 * level_mask. Must only be called from kernel debugger context.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14137 
14138 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14139 
/*
 * Page-table dumping is only available on arm64 DEVELOPMENT/DEBUG kernels;
 * report unsupported on all other configurations.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14146 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14147 
14148 
14149 #ifdef CONFIG_XNUPOST
14150 #ifdef __arm64__
/* Set by pmap_test_fault_handler() so pmap_test_access() can observe a fault */
static volatile bool pmap_test_took_fault = false;
14152 
14153 static bool
14154 pmap_test_fault_handler(arm_saved_state_t * state)
14155 {
14156 	bool retval                 = false;
14157 	uint32_t esr                = get_saved_state_esr(state);
14158 	esr_exception_class_t class = ESR_EC(esr);
14159 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14160 
14161 	if ((class == ESR_EC_DABORT_EL1) &&
14162 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14163 		pmap_test_took_fault = true;
14164 		/* return to the instruction immediately after the call to NX page */
14165 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14166 		retval = true;
14167 	}
14168 
14169 	return retval;
14170 }
14171 
14172 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14173 static NOKASAN bool
14174 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14175 {
14176 	pmap_t old_pmap = NULL;
14177 
14178 	pmap_test_took_fault = false;
14179 
14180 	/*
14181 	 * We're potentially switching pmaps without using the normal thread
14182 	 * mechanism; disable interrupts and preemption to avoid any unexpected
14183 	 * memory accesses.
14184 	 */
14185 	uint64_t old_int_state = pmap_interrupts_disable();
14186 	mp_disable_preemption();
14187 
14188 	if (pmap != NULL) {
14189 		old_pmap = current_pmap();
14190 		pmap_switch(pmap);
14191 
14192 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
14193 #if __ARM_PAN_AVAILABLE__
14194 		__builtin_arm_wsr("pan", 0);
14195 #endif /* __ARM_PAN_AVAILABLE__ */
14196 	}
14197 
14198 	ml_expect_fault_begin(pmap_test_fault_handler, va);
14199 
14200 	if (is_write) {
14201 		*((volatile uint64_t*)(va)) = 0xdec0de;
14202 	} else {
14203 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
14204 		(void)tmp;
14205 	}
14206 
14207 	/* Save the fault bool, and undo the gross stuff we did. */
14208 	bool took_fault = pmap_test_took_fault;
14209 	ml_expect_fault_end();
14210 
14211 	if (pmap != NULL) {
14212 #if __ARM_PAN_AVAILABLE__
14213 		__builtin_arm_wsr("pan", 1);
14214 #endif /* __ARM_PAN_AVAILABLE__ */
14215 
14216 		pmap_switch(old_pmap);
14217 	}
14218 
14219 	mp_enable_preemption();
14220 	pmap_interrupts_restore(old_int_state);
14221 	bool retval = (took_fault == should_fault);
14222 	return retval;
14223 }
14224 
14225 static bool
14226 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14227 {
14228 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14229 
14230 	if (!retval) {
14231 		T_FAIL("%s: %s, "
14232 		    "pmap=%p, va=%p, should_fault=%u",
14233 		    __func__, should_fault ? "did not fault" : "faulted",
14234 		    pmap, (void*)va, (unsigned)should_fault);
14235 	}
14236 
14237 	return retval;
14238 }
14239 
14240 static bool
14241 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14242 {
14243 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14244 
14245 	if (!retval) {
14246 		T_FAIL("%s: %s, "
14247 		    "pmap=%p, va=%p, should_fault=%u",
14248 		    __func__, should_fault ? "did not fault" : "faulted",
14249 		    pmap, (void*)va, (unsigned)should_fault);
14250 	}
14251 
14252 	return retval;
14253 }
14254 
14255 static bool
14256 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14257 {
14258 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14259 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14260 
14261 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14262 
14263 	if (!retval) {
14264 		T_FAIL("%s: bits=%u, "
14265 		    "pa=%p, should_be_set=%u",
14266 		    __func__, bits,
14267 		    (void*)pa, should_be_set);
14268 	}
14269 
14270 	return retval;
14271 }
14272 
14273 static __attribute__((noinline)) bool
14274 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14275 {
14276 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14277 	return retval;
14278 }
14279 
14280 static int
14281 pmap_test_test_config(unsigned int flags)
14282 {
14283 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
14284 	unsigned int map_count = 0;
14285 	unsigned long page_ratio = 0;
14286 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
14287 
14288 	if (!pmap) {
14289 		panic("Failed to allocate pmap");
14290 	}
14291 
14292 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14293 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14294 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14295 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14296 
14297 	if (pmap_page_size <= native_page_size) {
14298 		page_ratio = native_page_size / pmap_page_size;
14299 	} else {
14300 		/*
14301 		 * We claim to support a page_ratio of less than 1, which is
14302 		 * not currently supported by the pmap layer; panic.
14303 		 */
14304 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14305 		    "flags=%u",
14306 		    __func__, native_page_size, pmap_page_size,
14307 		    flags);
14308 	}
14309 
14310 	if (PAGE_RATIO > 1) {
14311 		/*
14312 		 * The kernel is deliberately pretending to have 16KB pages.
14313 		 * The pmap layer has code that supports this, so pretend the
14314 		 * page size is larger than it is.
14315 		 */
14316 		pmap_page_size = PAGE_SIZE;
14317 		native_page_size = PAGE_SIZE;
14318 	}
14319 
14320 	/*
14321 	 * Get two pages from the VM; one to be mapped wired, and one to be
14322 	 * mapped nonwired.
14323 	 */
14324 	vm_page_t unwired_vm_page = vm_page_grab();
14325 	vm_page_t wired_vm_page = vm_page_grab();
14326 
14327 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14328 		panic("Failed to grab VM pages");
14329 	}
14330 
14331 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14332 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14333 
14334 	pmap_paddr_t pa = ptoa(pn);
14335 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14336 
14337 	/*
14338 	 * We'll start mappings at the second twig TT.  This keeps us from only
14339 	 * using the first entry in each TT, which would trivially be address
14340 	 * 0; one of the things we will need to test is retrieving the VA for
14341 	 * a given PTE.
14342 	 */
14343 	vm_map_address_t va_base = pmap_twig_size;
14344 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14345 
14346 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14347 		/*
14348 		 * Not exactly a functional failure, but this test relies on
14349 		 * there being a spare PTE slot we can use to pin the TT.
14350 		 */
14351 		panic("Cannot pin translation table");
14352 	}
14353 
14354 	/*
14355 	 * Create the wired mapping; this will prevent the pmap layer from
14356 	 * reclaiming our test TTs, which would interfere with this test
14357 	 * ("interfere" -> "make it panic").
14358 	 */
14359 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14360 
14361 #if XNU_MONITOR
14362 	/*
14363 	 * If the PPL is enabled, make sure that the kernel cannot write
14364 	 * to PPL memory.
14365 	 */
14366 	if (!pmap_ppl_disable) {
14367 		T_LOG("Validate that kernel cannot write to PPL memory.");
14368 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14369 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14370 	}
14371 #endif
14372 
14373 	/*
14374 	 * Create read-only mappings of the nonwired page; if the pmap does
14375 	 * not use the same page size as the kernel, create multiple mappings
14376 	 * so that the kernel page is fully mapped.
14377 	 */
14378 	for (map_count = 0; map_count < page_ratio; map_count++) {
14379 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14380 	}
14381 
14382 	/* Validate that all the PTEs have the expected PA and VA. */
14383 	for (map_count = 0; map_count < page_ratio; map_count++) {
14384 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14385 
14386 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14387 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14388 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14389 		}
14390 
14391 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14392 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14393 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14394 		}
14395 	}
14396 
14397 	T_LOG("Validate that reads to our mapping do not fault.");
14398 	pmap_test_read(pmap, va_base, false);
14399 
14400 	T_LOG("Validate that writes to our mapping fault.");
14401 	pmap_test_write(pmap, va_base, true);
14402 
14403 	T_LOG("Make the first mapping writable.");
14404 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14405 
14406 	T_LOG("Validate that writes to our mapping do not fault.");
14407 	pmap_test_write(pmap, va_base, false);
14408 
14409 
14410 	T_LOG("Make the first mapping execute-only");
14411 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14412 
14413 
14414 	T_LOG("Validate that reads to our mapping do not fault.");
14415 	pmap_test_read(pmap, va_base, false);
14416 
14417 	T_LOG("Validate that writes to our mapping fault.");
14418 	pmap_test_write(pmap, va_base, true);
14419 
14420 
14421 	/*
14422 	 * For page ratios of greater than 1: validate that writes to the other
14423 	 * mappings still fault.  Remove the mappings afterwards (we're done
14424 	 * with page ratio testing).
14425 	 */
14426 	for (map_count = 1; map_count < page_ratio; map_count++) {
14427 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14428 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14429 	}
14430 
14431 	T_LOG("Mark the page unreferenced and unmodified.");
14432 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14433 	pmap_test_check_refmod(pa, 0);
14434 
14435 	/*
14436 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14437 	 * different protection/fault_type settings, and confirm that the
14438 	 * ref/mod state matches our expectations at each step.
14439 	 */
14440 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14441 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14442 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14443 
14444 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14445 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14446 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14447 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14448 
14449 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14450 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14451 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14452 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14453 
14454 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14455 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14456 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14457 
14458 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14459 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14460 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14461 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14462 
14463 	/*
14464 	 * Shared memory testing; we'll have two mappings; one read-only,
14465 	 * one read-write.
14466 	 */
14467 	vm_map_address_t rw_base = va_base;
14468 	vm_map_address_t ro_base = va_base + pmap_page_size;
14469 
14470 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14471 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14472 
14473 	/*
14474 	 * Test that we take faults as expected for unreferenced/unmodified
14475 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14476 	 * mapping permissions change as expected.
14477 	 */
14478 	T_LOG("!ref/!mod: expect no access");
14479 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14480 	pmap_test_read_write(pmap, ro_base, false, false);
14481 	pmap_test_read_write(pmap, rw_base, false, false);
14482 
14483 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14484 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14485 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14486 	pmap_test_read_write(pmap, ro_base, true, false);
14487 	pmap_test_read_write(pmap, rw_base, true, false);
14488 
14489 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14490 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14491 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14492 	pmap_test_read_write(pmap, ro_base, true, false);
14493 	pmap_test_read_write(pmap, rw_base, true, true);
14494 
14495 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14496 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14497 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14498 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14499 	pmap_test_read_write(pmap, ro_base, true, false);
14500 	pmap_test_read_write(pmap, rw_base, true, true);
14501 
14502 	T_LOG("RW protect both mappings; should not change protections.");
14503 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14504 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14505 	pmap_test_read_write(pmap, ro_base, true, false);
14506 	pmap_test_read_write(pmap, rw_base, true, true);
14507 
14508 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14509 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14510 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14511 	pmap_test_read_write(pmap, ro_base, true, false);
14512 	pmap_test_read_write(pmap, rw_base, true, false);
14513 
14514 	T_LOG("RW protect the page; mappings should not change protections.");
14515 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14516 	pmap_page_protect(pn, VM_PROT_ALL);
14517 	pmap_test_read_write(pmap, ro_base, true, false);
14518 	pmap_test_read_write(pmap, rw_base, true, true);
14519 
14520 	T_LOG("Read protect the page; RW mapping should become RO.");
14521 	pmap_page_protect(pn, VM_PROT_READ);
14522 	pmap_test_read_write(pmap, ro_base, true, false);
14523 	pmap_test_read_write(pmap, rw_base, true, false);
14524 
14525 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14526 	pmap_disconnect(pn);
14527 	if (!pmap_verify_free(pn)) {
14528 		T_FAIL("Page still has mappings");
14529 	}
14530 
14531 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14532 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14533 	pmap_destroy(pmap);
14534 
14535 	T_LOG("Release the pages back to the VM.");
14536 	vm_page_lock_queues();
14537 	vm_page_free(unwired_vm_page);
14538 	vm_page_free(wired_vm_page);
14539 	vm_page_unlock_queues();
14540 
14541 	T_LOG("Testing successful!");
14542 	return 0;
14543 }
14544 #endif /* __arm64__ */
14545 
14546 kern_return_t
14547 pmap_test(void)
14548 {
14549 	T_LOG("Starting pmap_tests");
14550 #ifdef __arm64__
14551 	int flags = 0;
14552 	flags |= PMAP_CREATE_64BIT;
14553 
14554 #if __ARM_MIXED_PAGE_SIZE__
14555 	T_LOG("Testing VM_PAGE_SIZE_4KB");
14556 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14557 	T_LOG("Testing VM_PAGE_SIZE_16KB");
14558 	pmap_test_test_config(flags);
14559 #else /* __ARM_MIXED_PAGE_SIZE__ */
14560 	pmap_test_test_config(flags);
14561 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14562 
14563 #endif /* __arm64__ */
14564 	T_PASS("completed pmap_test successfully");
14565 	return KERN_SUCCESS;
14566 }
14567 #endif /* CONFIG_XNUPOST */
14568 
14569 /*
14570  * The following function should never make it to RELEASE code, since
14571  * it provides a way to get the PPL to modify text pages.
14572  */
14573 #if DEVELOPMENT || DEBUG
14574 
14575 #define ARM_UNDEFINED_INSN 0xe7f000f0
14576 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14577 
14578 /**
14579  * Forcibly overwrite executable text with an illegal instruction.
14580  *
14581  * @note Only used for xnu unit testing.
14582  *
14583  * @param pa The physical address to corrupt.
14584  *
14585  * @return KERN_SUCCESS on success.
14586  */
14587 kern_return_t
14588 pmap_test_text_corruption(pmap_paddr_t pa)
14589 {
14590 #if XNU_MONITOR
14591 	return pmap_test_text_corruption_ppl(pa);
14592 #else /* XNU_MONITOR */
14593 	return pmap_test_text_corruption_internal(pa);
14594 #endif /* XNU_MONITOR */
14595 }
14596 
/**
 * Implementation of pmap_test_text_corruption(): overwrite the instruction
 * at physical address [pa] with an undefined-instruction encoding, through
 * the kernel's physical aperture mapping.
 *
 * @note Only used for xnu unit testing (see the non-PPL wrapper above).
 *
 * @param pa The physical address to corrupt.  The low bit, when set, marks
 *           the target as a THUMB instruction and is not part of the address.
 *
 * @return KERN_SUCCESS
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	/* Kernel virtual (physical aperture) address aliasing pa. */
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the PV head lock so the page's mapping state stays stable. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	/* The page must have at least one mapping recorded. */
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages are kept read-only in the physical aperture (the
	 * AP is restored to AP_RONA below); temporarily make the aperture
	 * mapping kernel-writable so the store does not fault.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the aperture mapping's original read-only permission. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * Invalidate the I-cache so subsequent execution observes the
	 * newly-written instruction rather than a stale cached copy.
	 */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14639 
14640 #endif /* DEVELOPMENT || DEBUG */
14641