xref: /xnu-10002.61.3/osfmk/arm/pmap/pmap.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Operation vector for "native" page tables (tables walked by this CPU's own
 * MMU, as opposed to any other table format).  Installed as pta_ops in the
 * page_table_attr structures below so generic pmap code can dispatch
 * ASID management, TLB maintenance, and WIMG-to-PTE conversion indirectly.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                                    /* allocate an ASID for a pmap */
	.free_id = free_asid,                                      /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async, /* async VA-range TLB flush */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,          /* async full per-ASID TLB flush */
	.wimg_to_pte = wimg_to_pte,                                /* WIMG cacheability -> PTE attr bits */
};
140 
/*
 * Per-level translation table geometry for the 16KB granule, levels 0-3.
 * Levels 0-2 contain table/block entries (TTEs) and use the TTE valid/type
 * masks; the leaf level (3) contains page entries and uses the PTE masks.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, so PTE-specific masks apply. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level translation table geometry for the 4KB granule, levels 0-3.
 * Mirrors pmap_table_level_info_16k; only the per-level size/shift/mask
 * constants differ.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, so PTE-specific masks apply. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/*
 * Attributes describing 4KB-granule page tables: geometry (via the level-info
 * table above), access-permission and execute-never PTE encodings, and page
 * size/shift.  The root level is derived from T0SZ_BOOT (9 VA bits resolved
 * per 4K level); the commpage nesting level varies with kernel page-size
 * configuration.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),       /* kernel RO / user RO */
	.ap_rw = ARM_PTE_AP(AP_RWRW),       /* kernel RW / user RW */
	.ap_rona = ARM_PTE_AP(AP_RONA),     /* kernel RO / user no access */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),     /* kernel RW / user no access */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,  /* execute-never at both ELs */
	.ap_x = ARM_PTE_PNX,                /* user-executable (privileged XN only) */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_page_shift = 12,
};
247 
/*
 * Attributes describing 16KB-granule page tables.  Parallel to
 * pmap_pt_attr_4k; 16K tables always root at L1 and nest the commpage at L2.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),       /* kernel RO / user RO */
	.ap_rw = ARM_PTE_AP(AP_RWRW),       /* kernel RW / user RW */
	.ap_rona = ARM_PTE_AP(AP_RONA),     /* kernel RO / user no access */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),     /* kernel RW / user no access */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,  /* execute-never at both ELs */
	.ap_x = ARM_PTE_PNX,                /* user-executable (privileged XN only) */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_page_shift = 14,
};
266 
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272 
273 
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276 
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279 
280 #endif /* MACH_ASSERT */
281 
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 	(current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288 
289 
290 /*
291  * Represents a tlb range that will be flushed before exiting
292  * the ppl.
293  * Used by phys_attribute_clear_range to defer flushing pages in
294  * this range until the end of the operation.
295  */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;            /* pmap whose mappings lie in the range */
	vm_map_address_t ptfr_start; /* start VA of the deferred-flush range */
	vm_map_address_t ptfr_end;   /* end VA of the deferred-flush range */
	bool ptfr_flush_needed;      /* true once a mapping change requires the flush */
} pmap_tlb_flush_range_t;
302 
303 #if XNU_MONITOR
304 /*
305  * PPL External References.
306  */
307 extern vm_offset_t   segPPLDATAB;
308 extern unsigned long segSizePPLDATA;
309 extern vm_offset_t   segPPLTEXTB;
310 extern unsigned long segSizePPLTEXT;
311 extern vm_offset_t   segPPLDATACONSTB;
312 extern unsigned long segSizePPLDATACONST;
313 
314 
315 /*
316  * PPL Global Variables
317  */
318 
319 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
320 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
321 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
322 #else
323 const boolean_t pmap_ppl_disable = FALSE;
324 #endif
325 
326 /*
327  * Indicates if the PPL has started applying APRR.
328  * This variable is accessed from various assembly trampolines, so be sure to change
329  * those if you change the size or layout of this variable.
330  */
331 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
332 
333 extern void *pmap_stacks_start;
334 extern void *pmap_stacks_end;
335 
#endif /* XNU_MONITOR */
337 
338 
339 
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
342 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344 
345 extern uint8_t bootstrap_pagetables[];
346 
347 extern unsigned int not_in_kdp;
348 
349 extern vm_offset_t first_avail;
350 
351 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
352 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
353 extern vm_offset_t     static_memory_end;
354 
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357 
358 extern int maxproc, hard_maxproc;
359 
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362 
363 /*
364  * The bounds on our TTBRs.  These are for sanity checking that
365  * an address is accessible by a TTBR before we attempt to map it.
366  */
367 
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370 
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373 
374 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t    kernel_pmap = &kernel_pmap_store;
376 
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
378 
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
382 
/* Freelist node overlaid on an unused translation table page. */
typedef struct tt_free_entry {
	struct tt_free_entry    *next;  /* next free table, or TT_FREE_ENTRY_NULL */
} tt_free_entry_t;
386 
387 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
388 
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX   4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
400 
401 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
402 
403 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409 
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412 
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
415 
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418 
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1;                                     /* enable no-execute protection */
421 int allow_data_exec  = 0;                               /* No apps may execute data */
422 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1;                                       /* enable no-execute protection */
428 const int allow_data_exec  = 0;                         /* No apps may execute data */
429 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431 
432 /**
433  * This variable is set true during hibernation entry to protect pmap data structures
434  * during image copying, and reset false on hibernation exit.
435  */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437 
438 #if MACH_ASSERT
439 static void pmap_check_ledgers(pmap_t pmap);
440 #else
/* Ledger checking is only implemented on MACH_ASSERT kernels; no-op stub here. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
445 #endif /* MACH_ASSERT */
446 
447 /**
448  * This helper function ensures that potentially-long-running batched PPL operations are
449  * called in preemptible context before entering the PPL, so that the PPL call may
450  * periodically exit to allow pending urgent ASTs to be taken.
451  */
static inline void
pmap_verify_preemptible(void)
{
	/* Early boot legitimately runs with preemption disabled; allow it. */
	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
}
457 
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459 
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
462 
463 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
464 
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470     "Minimum address space size outside allowable range");
471 
472 // Max offset is 15.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 11.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476 
477 
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479     "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481     "Small device address space size outside allowable range");
482 
483 #  ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 #  else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 #  endif
488 #endif /* __arm64__ */
489 
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495 
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
499 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
500 #if !HAS_16BIT_ASID
501 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
502 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
503 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
504 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
505 #else
506 static uint16_t last_allocated_asid = 0;
507 #endif /* !HAS_16BIT_ASID */
508 
509 
510 #if __ARM_MIXED_PAGE_SIZE__
511 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
512 #endif
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
514 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
515 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
516 
517 /* PTE Define Macros */
518 
519 #define ARM_PTE_IS_COMPRESSED(x, p) \
520 	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
521 	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
522 	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
523 	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
524 	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
525 
526 #define pte_is_wired(pte)                                                               \
527 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
528 
529 #define pte_was_writeable(pte) \
530 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
531 
532 #define pte_set_was_writeable(pte, was_writeable) \
533 	do {                                         \
534 	        if ((was_writeable)) {               \
535 	                (pte) |= ARM_PTE_WRITEABLE;  \
536 	        } else {                             \
537 	                (pte) &= ~ARM_PTE_WRITEABLE; \
538 	        }                                    \
539 	} while(0)
540 
541 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)542 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
543 {
544 	if (wired) {
545 		*ptep |= ARM_PTE_WIRED;
546 	} else {
547 		*ptep &= ~ARM_PTE_WIRED;
548 	}
549 	/*
550 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
551 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
552 	 * never reclaimed.
553 	 */
554 	if (pmap == kernel_pmap) {
555 		return;
556 	}
557 	unsigned short *ptd_wiredcnt_ptr;
558 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
559 	if (wired) {
560 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
561 	} else {
562 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 		if (__improbable(prev_wired == 0)) {
564 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
565 		}
566 	}
567 }
568 
569 #if HAS_FEAT_XS
570 
571 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)572 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
573 {
574 	if (__improbable(pt_attr->stage2)) {
575 		return false;
576 	}
577 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
578 	case CACHE_ATTRINDX_POSTED_XS:
579 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
580 		return true;
581 	default:
582 		return false;
583 	}
584 }
585 
586 #endif /* HAS_FEAT_XS */
587 
588 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
589 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
590 	arm64_sync_tlb(strong);                                                                               \
591 }
592 
593 /*
594  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
595  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
596  * will observe the updated PTE.
597  */
598 #define FLUSH_PTE()                                                                     \
599 	__builtin_arm_dmb(DMB_ISH);
600 
601 /*
602  * Synchronize updates to PTEs that were previously valid and thus may be cached in
603  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
604  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
605  * program order will not issue until the DSB completes.  Prior loads may be reordered
606  * after the barrier, but their behavior should not be materially affected by the
607  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
608  * matter for loads until the access is re-driven well after the TLB update is
609  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
610  * we should be in a position to handle access faults.  For "voluntary" PTE access
611  * restriction due to unmapping or protection, the decision to restrict access should
612  * have a data dependency on prior loads in order to avoid a data race.
613  */
614 #define FLUSH_PTE_STRONG()                                                             \
615 	__builtin_arm_dsb(DSB_ISHST);
616 
617 /**
618  * Write enough page table entries to map a single VM page. On systems where the
619  * VM page size does not match the hardware page size, multiple page table
620  * entries will need to be written.
621  *
622  * @note This function does not emit a barrier to ensure these page table writes
623  *       have completed before continuing. This is commonly needed. In the case
624  *       where a DMB or DSB barrier is needed, then use the write_pte() and
625  *       write_pte_strong() functions respectively instead of this one.
626  *
627  * @param ptep Pointer to the first page table entry to update.
628  * @param pte The value to write into each page table entry. In the case that
629  *            multiple PTEs are updated to a non-empty value, then the address
630  *            in this value will automatically be incremented for each PTE
631  *            write.
632  */
633 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)634 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
635 {
636 	/**
637 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
638 	 * systems, which is why it's checked at runtime instead of compile time.
639 	 * The "unreachable" warning needs to be suppressed because it still is a
640 	 * compile time constant on some systems.
641 	 */
642 	__unreachable_ok_push
643 	if (TEST_PAGE_RATIO_4) {
644 		if (((uintptr_t)ptep) & 0x1f) {
645 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
646 			    __func__, ptep, (void*)pte);
647 		}
648 
649 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
650 			/**
651 			 * If we're writing an empty/compressed PTE value, then don't
652 			 * auto-increment the address for each PTE write.
653 			 */
654 			*ptep = pte;
655 			*(ptep + 1) = pte;
656 			*(ptep + 2) = pte;
657 			*(ptep + 3) = pte;
658 		} else {
659 			*ptep = pte;
660 			*(ptep + 1) = pte | 0x1000;
661 			*(ptep + 2) = pte | 0x2000;
662 			*(ptep + 3) = pte | 0x3000;
663 		}
664 	} else {
665 		*ptep = pte;
666 	}
667 	__unreachable_ok_pop
668 }
669 
670 /**
671  * Writes enough page table entries to map a single VM page and then ensures
672  * those writes complete by executing a Data Memory Barrier.
673  *
674  * @note The DMB issued by this function is not strong enough to protect against
675  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
676  *       instruction is going to immediately be called after this write, it's
677  *       recommended to call write_pte_strong() instead of this function.
678  *
679  * See the function header for write_pte_fast() for more details on the
680  * parameters.
681  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: order the PTE store(s) before subsequent accesses */
}
688 
689 /**
690  * Writes enough page table entries to map a single VM page and then ensures
691  * those writes complete by executing a Data Synchronization Barrier. This
692  * barrier provides stronger guarantees than the DMB executed by write_pte().
693  *
694  * @note This function is useful if you're going to immediately flush the TLB
695  *       after making the PTE write. A DSB is required to protect against the
696  *       TLB invalidate being reordered before the PTE write.
697  *
698  * See the function header for write_pte_fast() for more details on the
699  * parameters.
700  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: ensure the PTE store(s) complete before a following TLBI */
}
707 
708 /**
709  * Retrieve the pmap structure for the thread running on the current CPU.
710  */
pmap_t
current_pmap()
{
	const pmap_t current = vm_map_pmap(current_thread()->map);

	/* Every thread has a map, and every map has a pmap; NULL means corruption. */
	assert(current != NULL);

#if XNU_MONITOR
	/**
	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
	 * decided by kernel-writable memory. This function is used in various parts
	 * of the PPL, and besides validating that the pointer returned by this
	 * function is indeed a pmap structure, it's also important to ensure that
	 * it's actually the current thread's pmap. This is because different pmaps
	 * will have access to different entitlements based on the code signature of
	 * their loaded process. So if a different user pmap is set in the current
	 * thread structure (in an effort to bypass code signing restrictions), even
	 * though the structure would validate correctly as it is a real pmap
	 * structure, it should fail here.
	 *
	 * This only needs to occur for user pmaps because the kernel pmap's root
	 * page table is always the same as TTBR1 (it's set during bootstrap and not
	 * changed so it'd be redundant to check), and its code signing fields are
	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
	 * it shouldn't be possible to set those fields. Due to that, an attacker
	 * setting the current thread's pmap to the kernel pmap as a way to bypass
	 * this check won't accomplish anything as it doesn't provide any extra code
	 * signing entitlements.
	 */
	if ((current != kernel_pmap) &&
	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
	}
#endif /* XNU_MONITOR */

	return current;
}
749 
750 #if DEVELOPMENT || DEBUG
751 
752 /*
753  * Trace levels are controlled by a bitmask in which each
754  * level can be enabled/disabled by the (1<<level) position
755  * in the boot arg
756  * Level 0: PPL extension functionality
757  * Level 1: pmap lifecycle (create/destroy/switch)
758  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
759  * Level 3: internal state management (attributes/fast-fault)
760  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
761  */
762 
763 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
764 
765 #define PMAP_TRACE(level, ...) \
766 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
767 	        KDBG_RELEASE(__VA_ARGS__); \
768 	}
769 #else /* DEVELOPMENT || DEBUG */
770 
771 #define PMAP_TRACE(level, ...)
772 
773 #endif /* DEVELOPMENT || DEBUG */
774 
775 
776 /*
777  * Internal function prototypes (forward declarations).
778  */
779 
780 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
781 
782 static void pmap_set_reference(ppnum_t pn);
783 
784 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
785 
786 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
787 
788 static kern_return_t pmap_expand(
789 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
790 
791 static int pmap_remove_range(
792 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
793 
794 static tt_entry_t *pmap_tt1_allocate(
795 	pmap_t, vm_size_t, unsigned int);
796 
797 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
798 
799 static void pmap_tt1_deallocate(
800 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
801 
802 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
803 
804 static kern_return_t pmap_tt_allocate(
805 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
806 
807 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
808 
809 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
810 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
811 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
812 
813 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
814 
815 
816 static void pmap_unmap_commpage(
817 	pmap_t pmap);
818 
819 static boolean_t
820 pmap_is_64bit(pmap_t);
821 
822 
823 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
824 
825 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
826 
827 static bool pmap_update_cache_attributes_locked(
828 	ppnum_t, unsigned, bool);
829 
830 static boolean_t arm_clear_fast_fault(
831 	ppnum_t ppnum,
832 	vm_prot_t fault_type,
833 	pt_entry_t *pte_p);
834 
835 static void pmap_trim_self(pmap_t pmap);
836 static void pmap_trim_subord(pmap_t subord);
837 
838 
839 /*
840  * Temporary prototypes, while we wait for pmap_enter to move to taking an
841  * address instead of a page number.
842  */
843 static kern_return_t
844 pmap_enter_addr(
845 	pmap_t pmap,
846 	vm_map_address_t v,
847 	pmap_paddr_t pa,
848 	vm_prot_t prot,
849 	vm_prot_t fault_type,
850 	unsigned int flags,
851 	boolean_t wired);
852 
853 kern_return_t
854 pmap_enter_options_addr(
855 	pmap_t pmap,
856 	vm_map_address_t v,
857 	pmap_paddr_t pa,
858 	vm_prot_t prot,
859 	vm_prot_t fault_type,
860 	unsigned int flags,
861 	boolean_t wired,
862 	unsigned int options,
863 	__unused void   *arg,
864 	__unused pmap_mapping_type_t mapping_type);
865 
866 #ifdef CONFIG_XNUPOST
867 kern_return_t pmap_test(void);
868 #endif /* CONFIG_XNUPOST */
869 
870 PMAP_SUPPORT_PROTOTYPES(
871 	kern_return_t,
872 	arm_fast_fault, (pmap_t pmap,
873 	vm_map_address_t va,
874 	vm_prot_t fault_type,
875 	bool was_af_fault,
876 	bool from_user), ARM_FAST_FAULT_INDEX);
877 
878 PMAP_SUPPORT_PROTOTYPES(
879 	boolean_t,
880 	arm_force_fast_fault, (ppnum_t ppnum,
881 	vm_prot_t allow_mode,
882 	int options), ARM_FORCE_FAST_FAULT_INDEX);
883 
884 MARK_AS_PMAP_TEXT static boolean_t
885 arm_force_fast_fault_with_flush_range(
886 	ppnum_t ppnum,
887 	vm_prot_t allow_mode,
888 	int options,
889 	pmap_tlb_flush_range_t *flush_range);
890 
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 *
 * Packed into a single 64-bit value (enforced by the static_assert below) so
 * it can be passed into and returned from the batch-update entry point.
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine; one of the PMAP_BATCH_SET_CACHE_ATTRIBUTES_* values */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	:0;                                 /* Zero-width terminator: forces the bit-fields to round out the uint64_t unit */
} batch_set_cache_attr_state_t;
902 
903 /* Possible values of the "state" field. */
904 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
905 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
906 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
907 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
908 
909 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
910 
911 PMAP_SUPPORT_PROTOTYPES(
912 	batch_set_cache_attr_state_t,
913 	pmap_batch_set_cache_attributes, (
914 #if XNU_MONITOR
915 		volatile upl_page_info_t *user_page_list,
916 #else /* !XNU_MONITOR */
917 		upl_page_info_array_t user_page_list,
918 #endif /* XNU_MONITOR */
919 		batch_set_cache_attr_state_t state,
920 		unsigned int page_cnt,
921 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
922 
923 PMAP_SUPPORT_PROTOTYPES(
924 	kern_return_t,
925 	pmap_change_wiring, (pmap_t pmap,
926 	vm_map_address_t v,
927 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
928 
929 PMAP_SUPPORT_PROTOTYPES(
930 	pmap_t,
931 	pmap_create_options, (ledger_t ledger,
932 	vm_map_size_t size,
933 	unsigned int flags,
934 	kern_return_t * kr), PMAP_CREATE_INDEX);
935 
936 PMAP_SUPPORT_PROTOTYPES(
937 	void,
938 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
939 
940 PMAP_SUPPORT_PROTOTYPES(
941 	kern_return_t,
942 	pmap_enter_options, (pmap_t pmap,
943 	vm_map_address_t v,
944 	pmap_paddr_t pa,
945 	vm_prot_t prot,
946 	vm_prot_t fault_type,
947 	unsigned int flags,
948 	boolean_t wired,
949 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
950 
951 PMAP_SUPPORT_PROTOTYPES(
952 	pmap_paddr_t,
953 	pmap_find_pa, (pmap_t pmap,
954 	addr64_t va), PMAP_FIND_PA_INDEX);
955 
956 PMAP_SUPPORT_PROTOTYPES(
957 	kern_return_t,
958 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
959 
960 
961 PMAP_SUPPORT_PROTOTYPES(
962 	boolean_t,
963 	pmap_is_empty, (pmap_t pmap,
964 	vm_map_offset_t va_start,
965 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
966 
967 
968 PMAP_SUPPORT_PROTOTYPES(
969 	unsigned int,
970 	pmap_map_cpu_windows_copy, (ppnum_t pn,
971 	vm_prot_t prot,
972 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
973 
974 PMAP_SUPPORT_PROTOTYPES(
975 	void,
976 	pmap_ro_zone_memcpy, (zone_id_t zid,
977 	vm_offset_t va,
978 	vm_offset_t offset,
979 	const vm_offset_t new_data,
980 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
981 
982 PMAP_SUPPORT_PROTOTYPES(
983 	uint64_t,
984 	pmap_ro_zone_atomic_op, (zone_id_t zid,
985 	vm_offset_t va,
986 	vm_offset_t offset,
987 	zro_atomic_op_t op,
988 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
989 
990 PMAP_SUPPORT_PROTOTYPES(
991 	void,
992 	pmap_ro_zone_bzero, (zone_id_t zid,
993 	vm_offset_t va,
994 	vm_offset_t offset,
995 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
996 
997 PMAP_SUPPORT_PROTOTYPES(
998 	vm_map_offset_t,
999 	pmap_nest, (pmap_t grand,
1000 	pmap_t subord,
1001 	addr64_t vstart,
1002 	uint64_t size,
1003 	vm_map_offset_t vrestart,
1004 	kern_return_t * krp), PMAP_NEST_INDEX);
1005 
1006 PMAP_SUPPORT_PROTOTYPES(
1007 	void,
1008 	pmap_page_protect_options, (ppnum_t ppnum,
1009 	vm_prot_t prot,
1010 	unsigned int options,
1011 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1012 
1013 PMAP_SUPPORT_PROTOTYPES(
1014 	vm_map_address_t,
1015 	pmap_protect_options, (pmap_t pmap,
1016 	vm_map_address_t start,
1017 	vm_map_address_t end,
1018 	vm_prot_t prot,
1019 	unsigned int options,
1020 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1021 
1022 PMAP_SUPPORT_PROTOTYPES(
1023 	kern_return_t,
1024 	pmap_query_page_info, (pmap_t pmap,
1025 	vm_map_offset_t va,
1026 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1027 
1028 PMAP_SUPPORT_PROTOTYPES(
1029 	mach_vm_size_t,
1030 	pmap_query_resident, (pmap_t pmap,
1031 	vm_map_address_t start,
1032 	vm_map_address_t end,
1033 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1034 
1035 PMAP_SUPPORT_PROTOTYPES(
1036 	void,
1037 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1038 
1039 PMAP_SUPPORT_PROTOTYPES(
1040 	vm_map_address_t,
1041 	pmap_remove_options, (pmap_t pmap,
1042 	vm_map_address_t start,
1043 	vm_map_address_t end,
1044 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1045 
1046 
1047 PMAP_SUPPORT_PROTOTYPES(
1048 	void,
1049 	pmap_set_cache_attributes, (ppnum_t pn,
1050 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1051 
1052 PMAP_SUPPORT_PROTOTYPES(
1053 	void,
1054 	pmap_update_compressor_page, (ppnum_t pn,
1055 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1056 
1057 PMAP_SUPPORT_PROTOTYPES(
1058 	void,
1059 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1060 
1061 #if MACH_ASSERT || XNU_MONITOR
1062 PMAP_SUPPORT_PROTOTYPES(
1063 	void,
1064 	pmap_set_process, (pmap_t pmap,
1065 	int pid,
1066 	char *procname), PMAP_SET_PROCESS_INDEX);
1067 #endif
1068 
1069 PMAP_SUPPORT_PROTOTYPES(
1070 	void,
1071 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1072 
1073 PMAP_SUPPORT_PROTOTYPES(
1074 	vm_map_offset_t,
1075 	pmap_unnest_options, (pmap_t grand,
1076 	addr64_t vaddr,
1077 	uint64_t size,
1078 	vm_map_offset_t vrestart,
1079 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1080 
1081 PMAP_SUPPORT_PROTOTYPES(
1082 	void,
1083 	phys_attribute_set, (ppnum_t pn,
1084 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1085 
1086 PMAP_SUPPORT_PROTOTYPES(
1087 	void,
1088 	phys_attribute_clear, (ppnum_t pn,
1089 	unsigned int bits,
1090 	int options,
1091 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1092 
1093 #if __ARM_RANGE_TLBI__
1094 PMAP_SUPPORT_PROTOTYPES(
1095 	vm_map_address_t,
1096 	phys_attribute_clear_range, (pmap_t pmap,
1097 	vm_map_address_t start,
1098 	vm_map_address_t end,
1099 	unsigned int bits,
1100 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1101 #endif /* __ARM_RANGE_TLBI__ */
1102 
1103 
1104 PMAP_SUPPORT_PROTOTYPES(
1105 	void,
1106 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1107 
1108 PMAP_SUPPORT_PROTOTYPES(
1109 	void,
1110 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1111 
1112 PMAP_SUPPORT_PROTOTYPES(
1113 	void,
1114 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1115 
1116 PMAP_SUPPORT_PROTOTYPES(
1117 	void,
1118 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1119 
1120 PMAP_SUPPORT_PROTOTYPES(
1121 	void,
1122 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1123 
1124 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1125 PMAP_SUPPORT_PROTOTYPES(
1126 	void,
1127 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1128 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1129 
/*
 * Definition of the states used by pmap_trim().
 *
 * The pmap_trim entry point both accepts and returns one of these states (see
 * its PMAP_SUPPORT_PROTOTYPES declaration below), so trimming runs as a
 * resumable state machine — NOTE(review): presumably so the PPL call can be
 * re-entered once per state; confirm against pmap_trim_internal().
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1150 
1151 PMAP_SUPPORT_PROTOTYPES(
1152 	pmap_trim_state_t,
1153 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1154 
1155 #if HAS_APPLE_PAC
1156 PMAP_SUPPORT_PROTOTYPES(
1157 	void *,
1158 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1159 PMAP_SUPPORT_PROTOTYPES(
1160 	void *,
1161 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1162 #endif /* HAS_APPLE_PAC */
1163 
1164 
1165 
1166 
1167 PMAP_SUPPORT_PROTOTYPES(
1168 	kern_return_t,
1169 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1170 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1171 
1172 PMAP_SUPPORT_PROTOTYPES(
1173 	kern_return_t,
1174 	pmap_load_trust_cache_with_type, (TCType_t type,
1175 	const vm_address_t pmap_img4_payload,
1176 	const vm_size_t pmap_img4_payload_len,
1177 	const vm_address_t img4_manifest,
1178 	const vm_size_t img4_manifest_len,
1179 	const vm_address_t img4_aux_manifest,
1180 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1181 
1182 PMAP_SUPPORT_PROTOTYPES(
1183 	void,
1184 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1185 
1186 PMAP_SUPPORT_PROTOTYPES(
1187 	kern_return_t,
1188 	pmap_query_trust_cache, (TCQueryType_t query_type,
1189 	const uint8_t cdhash[kTCEntryHashSize],
1190 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1191 
1192 #if PMAP_CS_INCLUDE_CODE_SIGNING
1193 
1194 PMAP_SUPPORT_PROTOTYPES(
1195 	kern_return_t,
1196 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1197 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1198 
1199 PMAP_SUPPORT_PROTOTYPES(
1200 	kern_return_t,
1201 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1202 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1203 
1204 PMAP_SUPPORT_PROTOTYPES(
1205 	kern_return_t,
1206 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1207 	pmap_cs_profile_t * profile_obj),
1208 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1209 
1210 PMAP_SUPPORT_PROTOTYPES(
1211 	kern_return_t,
1212 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1213 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1214 
1215 PMAP_SUPPORT_PROTOTYPES(
1216 	kern_return_t,
1217 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1218 	const void *kernel_entitlements),
1219 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1220 
1221 PMAP_SUPPORT_PROTOTYPES(
1222 	kern_return_t,
1223 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1224 	const void **kernel_entitlements),
1225 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1226 
1227 PMAP_SUPPORT_PROTOTYPES(
1228 	kern_return_t,
1229 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1230 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1231 
1232 PMAP_SUPPORT_PROTOTYPES(
1233 	kern_return_t,
1234 	pmap_cs_allow_invalid, (pmap_t pmap),
1235 	PMAP_CS_ALLOW_INVALID_INDEX);
1236 
1237 PMAP_SUPPORT_PROTOTYPES(
1238 	void,
1239 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1240 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1241 
1242 PMAP_SUPPORT_PROTOTYPES(
1243 	bool,
1244 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1245 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1246 
1247 PMAP_SUPPORT_PROTOTYPES(
1248 	void,
1249 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1250 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1251 
1252 PMAP_SUPPORT_PROTOTYPES(
1253 	void,
1254 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1255 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1256 
1257 #endif
1258 
1259 PMAP_SUPPORT_PROTOTYPES(
1260 	uint32_t,
1261 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1262 
1263 PMAP_SUPPORT_PROTOTYPES(
1264 	bool,
1265 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1266 
1267 PMAP_SUPPORT_PROTOTYPES(
1268 	void,
1269 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1270 
1271 void pmap_footprint_suspend(vm_map_t    map,
1272     boolean_t   suspend);
1273 PMAP_SUPPORT_PROTOTYPES(
1274 	void,
1275 	pmap_footprint_suspend, (vm_map_t map,
1276 	boolean_t suspend),
1277 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1278 
1279 
1280 
1281 
1282 #if DEVELOPMENT || DEBUG
1283 PMAP_SUPPORT_PROTOTYPES(
1284 	kern_return_t,
1285 	pmap_test_text_corruption, (pmap_paddr_t),
1286 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1287 #endif /* DEVELOPMENT || DEBUG */
1288 
1289 /*
1290  * The low global vector page is mapped at a fixed alias.
1291  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1292  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1293  * to check both addresses anyway for backward compatibility. So for now
1294  * we leave H6 and H7 where they were.
1295  */
1296 #if (ARM_PGSHIFT == 14)
1297 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1298 #else
1299 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1300 #endif
1301 
1302 
/*
 * Running counters of translation-table allocations (root tables, TTE pages,
 * PTE pages). 8-byte aligned — NOTE(review): presumably so 64-bit updates are
 * atomic on all supported CPUs; confirm at the increment sites.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1306 
#if XNU_MONITOR

/*
 * When pointer authentication is available, every slot of the PPL dispatch
 * table is stored as a signed function pointer: ptrauth_key_function_pointer,
 * address-discriminated, with a zero extra discriminator.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 * Indexed by the PMAP_*_INDEX constants; each slot points at the *_internal
 * implementation of the corresponding pmap interface. Slots whose feature is
 * compiled out are left NULL by the designated initializers.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif /* XNU_MONITOR */
1399 
1400 #if XNU_MONITOR
1401 /**
1402  * A convenience function for setting protections on a single physical
1403  * aperture or static region mapping without invalidating the TLB.
1404  *
1405  * @note This function does not perform any TLB invalidations. That must be done
1406  *       separately to be able to safely use the updated mapping.
1407  *
1408  * @note This function understands the difference between the VM page size and
1409  *       the kernel page size and will update multiple PTEs if the sizes differ.
1410  *       In other words, enough PTEs will always get updated to change the
1411  *       permissions on a PAGE_SIZE amount of memory.
1412  *
1413  * @note The PVH lock for the physical page represented by this mapping must
1414  *       already be locked.
1415  *
1416  * @note This function assumes the caller has already verified that the PTE
1417  *       pointer does indeed point to a physical aperture or static region page
1418  *       table. Please validate your inputs before passing it along to this
1419  *       function.
1420  *
1421  * @param ptep Pointer to the physical aperture or static region page table to
1422  *             update with a new XPRR index.
1423  * @param expected_perm The XPRR index that is expected to already exist at the
1424  *                      current mapping. If the current index doesn't match this
1425  *                      then the system will panic.
1426  * @param new_perm The new XPRR index to update the mapping with.
1427  */
1428 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1429 pmap_set_pte_xprr_perm(
1430 	pt_entry_t * const ptep,
1431 	unsigned int expected_perm,
1432 	unsigned int new_perm)
1433 {
1434 	assert(ptep != NULL);
1435 
1436 	pt_entry_t spte = *ptep;
1437 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1438 
1439 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1440 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1441 		    __func__, ptep, new_perm, expected_perm);
1442 	}
1443 
1444 	/**
1445 	 * The PTE involved should be valid, should not have the hint bit set, and
1446 	 * should have the expected XPRR index.
1447 	 */
1448 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1449 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1450 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1451 		    __func__, ptep, spte, new_perm, expected_perm);
1452 	}
1453 
1454 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1455 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1456 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1457 		    __func__, ptep, spte, new_perm, expected_perm);
1458 	}
1459 
1460 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1461 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1462 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1463 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1464 	}
1465 
1466 	pt_entry_t template = spte;
1467 	template &= ~ARM_PTE_XPRR_MASK;
1468 	template |= xprr_perm_to_pte(new_perm);
1469 
1470 	write_pte_strong(ptep, template);
1471 }
1472 
1473 /**
1474  * Update the protections on a single physical aperture mapping and invalidate
1475  * the TLB so the mapping can be used.
1476  *
1477  * @note The PVH lock for the physical page must already be locked.
1478  *
1479  * @param pai The physical address index of the page whose physical aperture
1480  *            mapping will be updated with new permissions.
1481  * @param expected_perm The XPRR index that is expected to already exist at the
1482  *                      current mapping. If the current index doesn't match this
1483  *                      then the system will panic.
1484  * @param new_perm The new XPRR index to update the mapping with.
1485  */
1486 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1487 pmap_set_xprr_perm(
1488 	unsigned int pai,
1489 	unsigned int expected_perm,
1490 	unsigned int new_perm)
1491 {
1492 	pvh_assert_locked(pai);
1493 
1494 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1495 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1496 
1497 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1498 
1499 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1500 	sync_tlb_flush();
1501 }
1502 
/**
 * Update the protections on a range of physical aperture or static region
 * mappings and invalidate the TLB so the mappings can be used.
 *
 * @note Static region mappings can only be updated before machine_lockdown().
 *       Physical aperture mappings can be updated at any time.
 *
 * @param start The starting virtual address of the static region or physical
 *              aperture range whose permissions will be updated.
 * @param end The page-aligned end virtual address of the range whose
 *            permissions will be updated. NOTE(review): this bound is
 *            exclusive — the loop below runs `while (va < end)` and clamps
 *            tte_va_end to `end` — despite older copies of this comment
 *            describing it as "inclusive".
 * @param expected_perm The XPRR index that is expected to already exist at the
 *                      current mappings. If the current indices don't match
 *                      this then the system will panic.
 * @param new_perm The new XPRR index to update the mappings with.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The whole range must sit inside the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only a table-type twig entry can be descended into for leaf PTEs. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* Each PTE update is done under the PVH lock of the page it maps. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One TLB maintenance pass for the whole range, after all PTEs are rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1614 
1615 #endif /* XNU_MONITOR */
1616 
1617 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1618 PMAP_ZINFO_PALLOC(
1619 	pmap_t pmap, int bytes)
1620 {
1621 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1622 }
1623 
1624 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1625 PMAP_ZINFO_PFREE(
1626 	pmap_t pmap,
1627 	int bytes)
1628 {
1629 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1630 }
1631 
1632 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1633 pmap_tt_ledger_credit(
1634 	pmap_t          pmap,
1635 	vm_size_t       size)
1636 {
1637 	if (pmap != kernel_pmap) {
1638 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1639 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1640 	}
1641 }
1642 
1643 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1644 pmap_tt_ledger_debit(
1645 	pmap_t          pmap,
1646 	vm_size_t       size)
1647 {
1648 	if (pmap != kernel_pmap) {
1649 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1650 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1651 	}
1652 }
1653 
/**
 * Record in the Pseudo-LRU state that a hardware ASID was just handed out.
 *
 * Clears the ASID's bit in the 64-bit PLRU bitmap word that covers it.  If
 * that clear empties the word (every ASID in the chunk was recently used),
 * the word's generation counter is advanced and the word is refilled so the
 * allocator can cycle through the chunk again.  The refill of the final word
 * leaves its top bit clear, since that bit corresponds to the out-of-range
 * slot MAX_HW_ASIDS (matching the initialization in pmap_bootstrap()).
 *
 * On HAS_16BIT_ASID targets this is a no-op: those targets allocate directly
 * from asid_bitmap and do not use the PLRU policy.
 *
 * @param asid_index The hardware ASID that was just allocated.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each bitmap word tracks 64 hardware ASIDs. */
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			/* Word exhausted: bump its generation and make all its ASIDs eligible again. */
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1667 
/**
 * Allocate a virtual ASID for @pmap and derive its hardware ASID and
 * software ("generation") ASID from it.
 *
 * On non-16-bit-ASID targets, allocation first consults the Pseudo-LRU
 * state: it picks the least-recently-refilled 64-ASID chunk and searches the
 * virtual-ASID bitmap at that chunk offset within each hardware-ASID-sized
 * stripe.  On 16-bit-ASID targets it allocates directly from the bitmap,
 * starting just past the most recently allocated ASID (see comment below).
 *
 * On success, pmap->hw_asid and pmap->sw_asid are filled in; hw_asid is
 * biased by 1 (ASID 0 is reserved for the kernel) and doubled under
 * __ARM_KERNEL_PROTECT__ (separate EL0/EL1 ASIDs).
 *
 * @return true on success, false if no virtual ASID is available.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the PLRU chunk with the lowest (oldest) generation count. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Search the virtual-ASID bitmap at this chunk's offset within each
		 * (MAX_HW_ASIDS + 1)-sized stripe, restricted to ASIDs the PLRU
		 * bitmap considers not-recently-used.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every virtual ASID is in use. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a (hw_asid, sw_asid) pair. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1752 
/**
 * Release a pmap's ASID back to the allocator.
 *
 * Atomically strips hw_asid out of the pmap (so a concurrent or repeated
 * free observes 0 and becomes a no-op), undoes the encoding applied by
 * alloc_asid() (the EL0/EL1 doubling under __ARM_KERNEL_PROTECT__ and the
 * +1 bias for the kernel-reserved ASID 0), reconstructs the virtual ASID
 * from the (hw_asid, sw_asid) pair, and marks it free again in the PLRU
 * and allocation bitmaps.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Exchange-with-zero makes double frees harmless. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;
#endif
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-VASID case: invert the "PLRU miss" remapping from alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Make the hardware ASID eligible again for the Pseudo-LRU policy. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1785 
1786 
1787 boolean_t
pmap_valid_address(pmap_paddr_t addr)1788 pmap_valid_address(
1789 	pmap_paddr_t addr)
1790 {
1791 	return pa_valid(addr);
1792 }
1793 
1794 
1795 
1796 
1797 
1798 
1799 /*
1800  *      Map memory at initialization.  The physical addresses being
1801  *      mapped are not managed and are never unmapped.
1802  *
1803  *      For now, VM is already on, we only need to map the
1804  *      specified memory.
1805  */
1806 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1807 pmap_map(
1808 	vm_map_address_t virt,
1809 	vm_offset_t start,
1810 	vm_offset_t end,
1811 	vm_prot_t prot,
1812 	unsigned int flags)
1813 {
1814 	kern_return_t   kr;
1815 	vm_size_t       ps;
1816 
1817 	ps = PAGE_SIZE;
1818 	while (start < end) {
1819 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1820 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1821 
1822 		if (kr != KERN_SUCCESS) {
1823 			panic("%s: failed pmap_enter, "
1824 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1825 			    __FUNCTION__,
1826 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1827 		}
1828 
1829 		virt += ps;
1830 		start += ps;
1831 	}
1832 	return virt;
1833 }
1834 
1835 #if XNU_MONITOR
1836 /**
1837  * Remove kernel writeablity from an IO PTE value if the page is owned by
1838  * guarded mode software.
1839  *
1840  * @param paddr The physical address of the page which has to be non-DRAM.
1841  * @param tmplate The PTE value to be evaluated.
1842  *
1843  * @return A new PTE value with permission bits modified.
1844  */
1845 static inline
1846 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1847 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1848 {
1849 	assert(!pa_valid(paddr));
1850 
1851 	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1852 
1853 	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1854 		/* PPL to own the page by converting KERN_RW to PPL_RW. */
1855 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1856 		switch (xprr_perm) {
1857 		case XPRR_KERN_RO_PERM:
1858 			break;
1859 		case XPRR_KERN_RW_PERM:
1860 			tmplate &= ~ARM_PTE_XPRR_MASK;
1861 			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1862 			break;
1863 		default:
1864 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1865 		}
1866 	}
1867 
1868 	return tmplate;
1869 }
1870 #endif /* XNU_MONITOR */
1871 
/**
 * Map the physically contiguous range [start, end) at kernel virtual address
 * @virt by writing leaf PTEs directly, bypassing pmap_enter().  The leaf page
 * tables covering the range must already exist; this panics otherwise.
 * Mappings are kernel-only, non-executable, and RW or RO depending on @prot.
 *
 * @param virt Starting kernel virtual address.
 * @param start Starting physical address (also programmed into the first PTE).
 * @param end Ending physical address (exclusive).
 * @param prot Only VM_PROT_WRITE is consulted (AP_RWNA vs AP_RONA).
 * @param options PMAP_MAP_BD_* selector for cacheability/memory type;
 *        unrecognized values map with caching disabled.
 *
 * @return The virtual address just past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	/* Translate the mapping option into memory-attribute/shareability bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			/* The page table for this VA was never allocated. */
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* IO page: may need its permissions downgraded if PPL-owned. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template by one page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1950 
1951 /*
1952  *      Back-door routine for mapping kernel VM at initialization.
1953  *      Useful for mapping memory outside the range
1954  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1955  *      Otherwise like pmap_map.
1956  */
1957 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1958 pmap_map_bd(
1959 	vm_map_address_t virt,
1960 	vm_offset_t start,
1961 	vm_offset_t end,
1962 	vm_prot_t prot)
1963 {
1964 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
1965 }
1966 
1967 /*
1968  *      Back-door routine for mapping kernel VM at initialization.
1969  *      Useful for mapping memory specific physical addresses in early
1970  *      boot (i.e., before kernel_map is initialized).
1971  *
1972  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1973  */
1974 
1975 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1976 pmap_map_high_window_bd(
1977 	vm_offset_t pa_start,
1978 	vm_size_t len,
1979 	vm_prot_t prot)
1980 {
1981 	pt_entry_t              *ptep, pte;
1982 	vm_map_address_t        va_start = VREGION1_START;
1983 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1984 	vm_map_address_t        va_end;
1985 	vm_map_address_t        va;
1986 	vm_size_t               offset;
1987 
1988 	offset = pa_start & PAGE_MASK;
1989 	pa_start -= offset;
1990 	len += offset;
1991 
1992 	if (len > (va_max - va_start)) {
1993 		panic("%s: area too large, "
1994 		    "pa_start=%p, len=%p, prot=0x%x",
1995 		    __FUNCTION__,
1996 		    (void*)pa_start, (void*)len, prot);
1997 	}
1998 
1999 scan:
2000 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2001 		ptep = pmap_pte(kernel_pmap, va_start);
2002 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2003 		if (*ptep == ARM_PTE_TYPE_FAULT) {
2004 			break;
2005 		}
2006 	}
2007 	if (va_start > va_max) {
2008 		panic("%s: insufficient pages, "
2009 		    "pa_start=%p, len=%p, prot=0x%x",
2010 		    __FUNCTION__,
2011 		    (void*)pa_start, (void*)len, prot);
2012 	}
2013 
2014 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2015 		ptep = pmap_pte(kernel_pmap, va_end);
2016 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2017 		if (*ptep != ARM_PTE_TYPE_FAULT) {
2018 			va_start = va_end + PAGE_SIZE;
2019 			goto scan;
2020 		}
2021 	}
2022 
2023 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2024 		ptep = pmap_pte(kernel_pmap, va);
2025 		pte = pa_to_pte(pa_start)
2026 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2027 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2028 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2029 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2030 #if __ARM_KERNEL_PROTECT__
2031 		pte |= ARM_PTE_NG;
2032 #endif /* __ARM_KERNEL_PROTECT__ */
2033 		write_pte_strong(ptep, pte);
2034 	}
2035 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2036 #if KASAN
2037 	kasan_notify_address(va_start, len);
2038 #endif
2039 	return va_start;
2040 }
2041 
2042 static uint32_t
pmap_compute_max_asids(void)2043 pmap_compute_max_asids(void)
2044 {
2045 	DTEntry entry;
2046 	void const *prop = NULL;
2047 	uint32_t max_asids;
2048 	int err;
2049 	unsigned int prop_size;
2050 
2051 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2052 	assert(err == kSuccess);
2053 
2054 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2055 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
2056 		 * we can choose a more flexible default value here. */
2057 		return MAX_ASIDS;
2058 	}
2059 
2060 	if (prop_size != sizeof(max_asids)) {
2061 		panic("pmap-max-asids property is not a 32-bit integer");
2062 	}
2063 
2064 	max_asids = *((uint32_t const *)prop);
2065 #if HAS_16BIT_ASID
2066 	if (max_asids > MAX_HW_ASIDS) {
2067 		panic("pmap-max-asids 0x%x too large", max_asids);
2068 	}
2069 #else
2070 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2071 	max_asids = (max_asids + 63) & ~63UL;
2072 
2073 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2074 		/* currently capped by size of pmap->sw_asid */
2075 		panic("pmap-max-asids 0x%x too large", max_asids);
2076 	}
2077 #endif /* HAS_16BIT_ASID */
2078 	if (max_asids == 0) {
2079 		panic("pmap-max-asids cannot be zero");
2080 	}
2081 	return max_asids;
2082 }
2083 
2084 #if __arm64__
2085 /*
2086  * pmap_get_arm64_prot
2087  *
2088  * return effective armv8 VMSA block protections including
2089  * table AP/PXN/XN overrides of a pmap entry
2090  *
2091  */
2092 
/**
 * Compute the effective VMSA protection bits (AP/XN/PXN) governing @addr in
 * @pmap, folding table-level overrides from each level of the walk into the
 * bits of the final block/page descriptor.
 *
 * @param pmap The pmap whose translation tables to walk.
 * @param addr The virtual address to query.
 *
 * @return The effective AP/NX/PNX bits in PTE bit positions, or 0 if the
 *         address is unmapped at any level.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level toward the leaf; on exit from this loop,
	 * `tte` holds the block/page descriptor that terminated the walk. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Unmapped at this level: no effective protections. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2141 #endif /* __arm64__ */
2142 
2143 /*
2144  *	Bootstrap the system enough to run with virtual memory.
2145  *
2146  *	The early VM initialization code has already allocated
2147  *	the first CPU's translation table and made entries for
2148  *	all the one-to-one mappings to be found there.
2149  *
2150  *	We must set up the kernel pmap structures, the
2151  *	physical-to-virtual translation lookup tables for the
2152  *	physical memory to be managed (between avail_start and
2153  *	avail_end).
2154  *
2155  *	Map the kernel's code and data, and allocate the system page table.
2156  *	Page_size must already be set.
2157  *
2158  *	Parameters:
2159  *	first_avail	first available physical page -
2160  *			   after kernel page tables
2161  *	avail_start	PA of first managed physical page
2162  *	avail_end	PA of last managed physical page
2163  */
2164 
2165 void
pmap_bootstrap(vm_offset_t vstart)2166 pmap_bootstrap(
2167 	vm_offset_t vstart)
2168 {
2169 	vm_map_offset_t maxoffset;
2170 
2171 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2172 
2173 #if XNU_MONITOR
2174 
2175 #if DEVELOPMENT || DEBUG
2176 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2177 #endif
2178 
2179 #if CONFIG_CSR_FROM_DT
2180 	if (csr_unsafe_kernel_text) {
2181 		pmap_ppl_disable = true;
2182 	}
2183 #endif /* CONFIG_CSR_FROM_DT */
2184 
2185 #endif /* XNU_MONITOR */
2186 
2187 #if DEVELOPMENT || DEBUG
2188 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2189 		kprintf("Kernel traces for pmap operations enabled\n");
2190 	}
2191 #endif
2192 
2193 	/*
2194 	 *	Initialize the kernel pmap.
2195 	 */
2196 #if ARM_PARAMETERIZED_PMAP
2197 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2198 #endif /* ARM_PARAMETERIZED_PMAP */
2199 #if HAS_APPLE_PAC
2200 	kernel_pmap->disable_jop = 0;
2201 #endif /* HAS_APPLE_PAC */
2202 	kernel_pmap->tte = cpu_tte;
2203 	kernel_pmap->ttep = cpu_ttep;
2204 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2205 	kernel_pmap->max = UINTPTR_MAX;
2206 	os_atomic_init(&kernel_pmap->ref_count, 1);
2207 #if XNU_MONITOR
2208 	os_atomic_init(&kernel_pmap->nested_count, 0);
2209 #endif
2210 	kernel_pmap->nx_enabled = TRUE;
2211 #ifdef  __arm64__
2212 	kernel_pmap->is_64bit = TRUE;
2213 #else
2214 	kernel_pmap->is_64bit = FALSE;
2215 #endif
2216 #if CONFIG_ROSETTA
2217 	kernel_pmap->is_rosetta = FALSE;
2218 #endif
2219 
2220 #if ARM_PARAMETERIZED_PMAP
2221 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2222 #endif /* ARM_PARAMETERIZED_PMAP */
2223 
2224 	kernel_pmap->nested_region_addr = 0x0ULL;
2225 	kernel_pmap->nested_region_size = 0x0ULL;
2226 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2227 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2228 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2229 
2230 	kernel_pmap->hw_asid = 0;
2231 	kernel_pmap->sw_asid = 0;
2232 
2233 	pmap_lock_init(kernel_pmap);
2234 
2235 	pmap_max_asids = pmap_compute_max_asids();
2236 #if HAS_16BIT_ASID
2237 	asid_chunk_size = MAX_HW_ASIDS;
2238 #else
2239 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2240 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2241 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2242 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2243 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2244 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2245 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2246 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2247 #endif /* HAS_16BIT_ASIDS */
2248 
2249 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2250 
2251 	/**
2252 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2253 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2254 	 * space for these data structures.
2255 	 */
2256 	pmap_data_bootstrap();
2257 
2258 	/**
2259 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2260 	 */
2261 	uat_bootstrap();
2262 
2263 
2264 	/**
2265 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2266 	 */
2267 	sart_bootstrap();
2268 
2269 	/**
2270 	 * Don't make any assumptions about the alignment of avail_start before this
2271 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2272 	 */
2273 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2274 
2275 	const pmap_paddr_t pmap_struct_start = avail_start;
2276 
2277 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2278 	avail_start = round_page(avail_start + asid_table_size);
2279 
2280 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2281 
2282 	vm_first_phys = gPhysBase;
2283 	vm_last_phys = trunc_page(avail_end);
2284 
2285 	queue_init(&map_pmap_list);
2286 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2287 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2288 	free_page_size_tt_count = 0;
2289 	free_page_size_tt_max = 0;
2290 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2291 	free_two_page_size_tt_count = 0;
2292 	free_two_page_size_tt_max = 0;
2293 	free_tt_list = TT_FREE_ENTRY_NULL;
2294 	free_tt_count = 0;
2295 	free_tt_max = 0;
2296 
2297 	virtual_space_start = vstart;
2298 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2299 
2300 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2301 #if !HAS_16BIT_ASID
2302 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2303 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2304 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2305 #endif /* !HAS_16BIT_ASID */
2306 
2307 
2308 
2309 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2310 		maxoffset = trunc_page(maxoffset);
2311 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2312 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2313 			arm_pmap_max_offset_default = maxoffset;
2314 		}
2315 	}
2316 #if defined(__arm64__)
2317 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2318 		maxoffset = trunc_page(maxoffset);
2319 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2320 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2321 			arm64_pmap_max_offset_default = maxoffset;
2322 		}
2323 	}
2324 #endif
2325 
2326 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2327 
2328 
2329 #if PMAP_CS_PPL_MONITOR
2330 	/* Initialize the PPL trust cache read-write lock */
2331 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2332 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2333 #endif
2334 
2335 #if MACH_ASSERT
2336 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2337 	    &vm_footprint_suspend_allowed,
2338 	    sizeof(vm_footprint_suspend_allowed));
2339 #endif /* MACH_ASSERT */
2340 
2341 #if KASAN
2342 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2343 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2344 #endif /* KASAN */
2345 
2346 	/**
2347 	 * Ensure that avail_start is always left on a page boundary. The calling
2348 	 * code might not perform any alignment before allocating page tables so
2349 	 * this is important.
2350 	 */
2351 	avail_start = round_page(avail_start);
2352 }
2353 
2354 #if XNU_MONITOR
2355 
2356 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2357 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2358 {
2359 	pmap_paddr_t cur_pa;
2360 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2361 		assert(pa_valid(cur_pa));
2362 		ppattr_pa_set_monitor(cur_pa);
2363 	}
2364 }
2365 
2366 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2367 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2368     pmap_paddr_t end_pa,
2369     unsigned int expected_perm,
2370     unsigned int new_perm)
2371 {
2372 	vm_offset_t start_va = phystokv(start_pa);
2373 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2374 
2375 	pa_set_range_monitor(start_pa, end_pa);
2376 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2377 }
2378 
/**
 * Lock down every physical page backing the kernelcache by setting
 * PVH_FLAG_LOCKDOWN_KC on its PV head, preventing its mappings from being
 * changed afterwards.  Pages whose physical address does not translate back
 * linearly into the kernelcache VA range are skipped, as they correspond to
 * memory the OS will reclaim.  Panics if any page is already locked down.
 * On CTRR test configurations, the dedicated CTRR test pages are exempted
 * again at the end.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain modifiable; strip their lockdown flag. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2424 
/**
 * Called once the static/bootstrap allocations are finalized: transfers
 * ownership and xPRR permissions of the bootstrap page tables, remaining
 * bootstrap allocations, RO page tables, PPL data/text segments, and PPL
 * stacks to the PPL (or relaxes the save areas back to kernel RW when the
 * PPL is disabled), then locks down the kernelcache pages.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permissions stay KERN_RO; this call only marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2512 
/**
 * Transition the PPL into its locked-down state: with preemption disabled,
 * lock down the commpage RO-data page (and, when present, the commpage text
 * page with read/execute protections) so their mappings can no longer be
 * altered.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this bare #error looks like the fallback arm of an
	 * XPRR-configuration conditional whose #if branches are not visible in
	 * this excerpt — confirm against the full source before modifying.
	 */
#error "XPRR configuration error"
}
2529 #endif /* XNU_MONITOR */
2530 
2531 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2532 pmap_virtual_space(
2533 	vm_offset_t *startp,
2534 	vm_offset_t *endp
2535 	)
2536 {
2537 	*startp = virtual_space_start;
2538 	*endp = virtual_space_end;
2539 }
2540 
2541 
/**
 * Describe the numbered kernel virtual regions whose page tables the VM may
 * populate.  Which regions exist depends on KTRR/CTRR and ARM_LARGE_MEMORY
 * configuration.
 *
 * @param region_select Index of the region being queried (0, 1, ...).
 * @param startp        Out: base address of the region.
 * @param size          Out: size of the region in bytes.
 *
 * @return TRUE if region_select names a valid region in this configuration,
 *         FALSE otherwise (startp/size untouched in that case).
 */
2542 boolean_t
pmap_virtual_region(unsigned int region_select,vm_map_offset_t * startp,vm_map_size_t * size)2543 pmap_virtual_region(
2544 	unsigned int region_select,
2545 	vm_map_offset_t *startp,
2546 	vm_map_size_t *size
2547 	)
2548 {
2549 	boolean_t       ret = FALSE;
2550 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2551 	if (region_select == 0) {
2552 		/*
2553 		 * In this config, the bootstrap mappings should occupy their own L2
2554 		 * TTs, as they should be immutable after boot.  Having the associated
2555 		 * TTEs and PTEs in their own pages allows us to lock down those pages,
2556 		 * while allowing the rest of the kernel address range to be remapped.
2557 		 */
2558 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2559 #if defined(ARM_LARGE_MEMORY)
2560 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2561 #else
2562 		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2563 #endif
2564 		ret = TRUE;
2565 	}
2566 
2567 #if defined(ARM_LARGE_MEMORY)
2568 	if (region_select == 1) {
2569 		*startp = VREGION1_START;
2570 		*size = VREGION1_SIZE;
2571 		ret = TRUE;
2572 	}
2573 #endif
2574 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2575 #if defined(ARM_LARGE_MEMORY)
2576 	/* For large memory systems with no KTRR/CTRR such as virtual machines */
2577 	if (region_select == 0) {
2578 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2579 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2580 		ret = TRUE;
2581 	}
2582 
2583 	if (region_select == 1) {
2584 		*startp = VREGION1_START;
2585 		*size = VREGION1_SIZE;
2586 		ret = TRUE;
2587 	}
2588 #else /* !defined(ARM_LARGE_MEMORY) */
2589 	unsigned long low_global_vr_mask = 0;
2590 	vm_map_size_t low_global_vr_size = 0;
2591 
2592 	if (region_select == 0) {
2593 		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2594 		if (!TEST_PAGE_SIZE_4K) {
			/* 16K pages: round to the 32MB (0x2000000) L2 block size. */
2595 			*startp = gVirtBase & 0xFFFFFFFFFE000000;
2596 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2597 		} else {
			/* 4K pages: round to the 8MB (0x800000) L2 block size. */
2598 			*startp = gVirtBase & 0xFFFFFFFFFF800000;
2599 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2600 		}
2601 		ret = TRUE;
2602 	}
2603 	if (region_select == 1) {
2604 		*startp = VREGION1_START;
2605 		*size = VREGION1_SIZE;
2606 		ret = TRUE;
2607 	}
2608 	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2609 	if (!TEST_PAGE_SIZE_4K) {
2610 		low_global_vr_mask = 0xFFFFFFFFFE000000;
2611 		low_global_vr_size = 0x2000000;
2612 	} else {
2613 		low_global_vr_mask = 0xFFFFFFFFFF800000;
2614 		low_global_vr_size = 0x800000;
2615 	}
2616 
	/* Region 2 exists only when the low globals do not already share region 0's L2 block. */
2617 	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2618 		*startp = LOW_GLOBAL_BASE_ADDRESS;
2619 		*size = low_global_vr_size;
2620 		ret = TRUE;
2621 	}
2622 
2623 	if (region_select == 3) {
2624 		/* In this config, we allow the bootstrap mappings to occupy the same
2625 		 * page table pages as the heap.
2626 		 */
2627 		*startp = VM_MIN_KERNEL_ADDRESS;
2628 		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2629 		ret = TRUE;
2630 	}
2631 #endif /* defined(ARM_LARGE_MEMORY) */
2632 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2633 	return ret;
2634 }
2635 
2636 /*
2637  * Routines to track and allocate physical pages during early boot.
2638  * On most systems that memory runs from first_avail through to avail_end
2639  * with no gaps.
2640  *
2641  * If the system supports ECC and ecc_bad_pages_count > 0, we
2642  * need to skip those pages.
2643  */
2644 
/* Number of boot-time physical pages still available to pmap_next_page(). */
2645 static unsigned int avail_page_count = 0;
/* Lazy-init flag: initialize_ram_ranges() runs on first allocator use. */
2646 static bool need_ram_ranges_init = true;
2647 
2648 
2649 /**
2650  * Checks to see if a given page is in
2651  * the array of known bad pages
2652  *
2653  * @param ppn page number to check
2654  */
2655 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2656 pmap_is_bad_ram(__unused ppnum_t ppn)
2657 {
	/* No ECC bad-page list is maintained in this configuration, so no page is ever "bad". */
2658 	return false;
2659 }
2660 
2661 /**
2662  * Prepare bad ram pages to be skipped.
2663  */
2664 
2665 /*
2666  * Initialize the count of available pages. No lock needed here,
2667  * as this code is called while kernel boot up is single threaded.
2668  */
2669 static void
initialize_ram_ranges(void)2670 initialize_ram_ranges(void)
2671 {
2672 	pmap_paddr_t first = first_avail;
2673 	pmap_paddr_t end = avail_end;
2674 
2675 	assert(first <= end);
2676 	assert(first == (first & ~PAGE_MASK));
2677 	assert(end == (end & ~PAGE_MASK));
2678 	avail_page_count = atop(end - first);
2679 
2680 	need_ram_ranges_init = false;
2681 }
2682 
2683 unsigned int
pmap_free_pages(void)2684 pmap_free_pages(
2685 	void)
2686 {
2687 	if (need_ram_ranges_init) {
2688 		initialize_ram_ranges();
2689 	}
2690 	return avail_page_count;
2691 }
2692 
2693 unsigned int
pmap_free_pages_span(void)2694 pmap_free_pages_span(
2695 	void)
2696 {
2697 	if (need_ram_ranges_init) {
2698 		initialize_ram_ranges();
2699 	}
2700 	return (unsigned int)atop(avail_end - first_avail);
2701 }
2702 
2703 
2704 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2705 pmap_next_page_hi(
2706 	ppnum_t            * pnum,
2707 	__unused boolean_t might_free)
2708 {
2709 	return pmap_next_page(pnum);
2710 }
2711 
2712 
2713 boolean_t
pmap_next_page(ppnum_t * pnum)2714 pmap_next_page(
2715 	ppnum_t *pnum)
2716 {
2717 	if (need_ram_ranges_init) {
2718 		initialize_ram_ranges();
2719 	}
2720 
2721 
2722 	if (first_avail != avail_end) {
2723 		*pnum = (ppnum_t)atop(first_avail);
2724 		first_avail += PAGE_SIZE;
2725 		assert(avail_page_count > 0);
2726 		--avail_page_count;
2727 		return TRUE;
2728 	}
2729 	assert(avail_page_count == 0);
2730 	return FALSE;
2731 }
2732 
2733 
2734 /*
2735  *	Initialize the pmap module.
2736  *	Called by vm_init, to initialize any structures that the pmap
2737  *	system needs to map virtual memory.
2738  */
/**
 * Initialize the pmap module.  Called by vm_init to set up any structures the
 * pmap system needs in order to map virtual memory: the pmap zone, the pmap
 * VM object, and the ASID-derived process limits.
 */
2739 void
pmap_init(void)2740 pmap_init(
2741 	void)
2742 {
2743 	/*
2744 	 *	Protect page zero in the kernel map.
2745 	 *	(can be overruled by permanent translation
2746 	 *	table entries at page zero - see arm_vm_init).
2747 	 */
2748 	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2749 
2750 	pmap_initialized = TRUE;
2751 
2752 	/*
2753 	 *	Create the zone of physical maps
2754 	 *	and the physical-to-virtual entries.
2755 	 */
2756 	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2757 	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2758 
2759 
2760 	/*
2761 	 *	Initialize the pmap object (for tracking the vm_page_t
2762 	 *	structures for pages we allocate to be page tables in
2763 	 *	pmap_expand().
2764 	 */
2765 	_vm_object_allocate(mem_size, pmap_object);
2766 	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2767 
2768 	/*
2769 	 * The values of [hard_]maxproc may have been scaled, make sure
2770 	 * they are still less than the value of pmap_max_asids.
2771 	 */
2772 	if ((uint32_t)maxproc > pmap_max_asids) {
2773 		maxproc = pmap_max_asids;
2774 	}
2775 	if ((uint32_t)hard_maxproc > pmap_max_asids) {
2776 		hard_maxproc = pmap_max_asids;
2777 	}
2778 }
2779 
2780 /**
2781  * Verify that a given physical page contains no mappings (outside of the
2782  * default physical aperture mapping).
2783  *
2784  * @param ppnum Physical page number to check there are no mappings to.
2785  *
2786  * @return True if there are no mappings, false otherwise or if the page is not
2787  *         kernel-managed.
2788  */
2789 bool
pmap_verify_free(ppnum_t ppnum)2790 pmap_verify_free(ppnum_t ppnum)
2791 {
2792 	const pmap_paddr_t pa = ptoa(ppnum);
2793 
2794 	assert(pa != vm_page_fictitious_addr);
2795 
2796 	/* Only mappings to kernel-managed physical memory are tracked. */
2797 	if (!pa_valid(pa)) {
2798 		return false;
2799 	}
2800 
2801 	const unsigned int pai = pa_index(pa);
2802 	pv_entry_t **pvh = pai_to_pvh(pai);
2803 
2804 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2805 }
2806 
2807 #if MACH_ASSERT
2808 /**
2809  * Verify that a given physical page contains no mappings (outside of the
2810  * default physical aperture mapping) and if it does, then panic.
2811  *
2812  * @note It's recommended to use pmap_verify_free() directly when operating in
2813  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2814  *       normally being called from outside of the PPL, and the pv_head_table
2815  *       can't be modified outside of the PPL).
2816  *
2817  * @param ppnum Physical page number to check there are no mappings to.
2818  */
2819 void
pmap_assert_free(ppnum_t ppnum)2820 pmap_assert_free(ppnum_t ppnum)
2821 {
2822 	const pmap_paddr_t pa = ptoa(ppnum);
2823 
2824 	/* Only mappings to kernel-managed physical memory are tracked. */
	/* Fast path: not kernel-managed, or genuinely free — nothing to report. */
2825 	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2826 		return;
2827 	}
2828 
2829 	const unsigned int pai = pa_index(pa);
2830 	pv_entry_t **pvh = pai_to_pvh(pai);
2831 
2832 	/**
2833 	 * This function is always called from outside of the PPL. Because of this,
2834 	 * the PVH entry can't be locked. This function is generally only called
2835 	 * before the VM reclaims a physical page and shouldn't be creating new
2836 	 * mappings. Even if a new mapping is created while parsing the hierarchy,
2837 	 * the worst case is that the system will panic in another way, and we were
2838 	 * already about to panic anyway.
2839 	 */
2840 
2841 	/**
2842 	 * Since pmap_verify_free() returned false, that means there is at least one
2843 	 * mapping left. Let's get some extra info on the first mapping we find to
2844 	 * dump in the panic string (the common case is that there is one spare
2845 	 * mapping that was never unmapped).
2846 	 */
2847 	pt_entry_t *first_ptep = PT_ENTRY_NULL;
2848 
2849 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH entry is the PTE pointer itself. */
2850 		first_ptep = pvh_ptep(pvh);
2851 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		/* Multi-mapping case: walk the first PVE for its first valid PTE. */
2852 		pv_entry_t *pvep = pvh_pve_list(pvh);
2853 
2854 		/* Each PVE can contain multiple PTEs. Let's find the first one. */
2855 		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2856 			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2857 			if (first_ptep != PT_ENTRY_NULL) {
2858 				break;
2859 			}
2860 		}
2861 
2862 		/* The PVE should have at least one valid PTE. */
2863 		assert(first_ptep != PT_ENTRY_NULL);
2864 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2865 		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2866 		    __func__, pvh, pai);
2867 	} else {
2868 		/**
2869 		 * The mapping disappeared between here and the pmap_verify_free() call.
2870 		 * The only way that can happen is if the VM was racing this call with
2871 		 * a call that unmaps PTEs. Operations on this page should not be
2872 		 * occurring at the same time as this check, and unfortunately we can't
2873 		 * lock the PVH entry to prevent it, so just panic instead.
2874 		 */
2875 		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2876 		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2877 		    __func__, pvh, pai);
2878 	}
2879 
2880 	/* Panic with a unique string identifying the first bad mapping and owner. */
2881 	{
2882 		/* First PTE is mapped by the main CPUs. */
2883 		pmap_t pmap = ptep_get_pmap(first_ptep);
2884 		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2885 
2886 		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2887 		    "%s CPU mapping (pmap: %p)",
2888 		    __func__, (uint64_t)pa, first_ptep, type, pmap);
2889 	}
2890 }
2891 #endif
2892 
2893 
2894 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2895 pmap_root_alloc_size(pmap_t pmap)
2896 {
2897 #pragma unused(pmap)
2898 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2899 	unsigned int root_level = pt_attr_root_level(pt_attr);
2900 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2901 }
2902 
2903 
2904 /*
2905  *	Create and return a physical map.
2906  *
2907  *	If the size specified for the map
2908  *	is zero, the map is an actual physical
2909  *	map, and may be referenced by the
2910  *	hardware.
2911  *
2912  *	If the size specified is non-zero,
2913  *	the map will be used in software only, and
2914  *	is bounded by that size.
2915  */
/**
 * Internal implementation of pmap creation: allocates the pmap structure
 * (from the PPL pool or pmap_zone), an ASID, and the root translation table,
 * then links the new pmap onto map_pmap_list.
 *
 * @param ledger Ledger to charge this pmap's memory against (may be NULL
 *               in the non-PPL path; retained by the caller).
 * @param size   Must be 0 (non-zero sizes are only meaningful for stage 2).
 * @param flags  PMAP_CREATE_* flags; unknown flags are rejected.
 * @param kr     Out: detailed kern_return_t on failure.
 *
 * @return The new pmap, or PMAP_NULL on failure (with *kr set).
 */
2916 MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(ledger_t ledger,vm_map_size_t size,unsigned int flags,kern_return_t * kr)2917 pmap_create_options_internal(
2918 	ledger_t ledger,
2919 	vm_map_size_t size,
2920 	unsigned int flags,
2921 	kern_return_t *kr)
2922 {
2923 	unsigned        i;
2924 	unsigned        tte_index_max;
2925 	pmap_t          p;
2926 	bool is_64bit = flags & PMAP_CREATE_64BIT;
2927 #if defined(HAS_APPLE_PAC)
2928 	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2929 #endif /* defined(HAS_APPLE_PAC) */
2930 	kern_return_t   local_kr = KERN_SUCCESS;
2931 
2932 	if (size != 0) {
2933 		{
2934 			// Size parameter should only be set for stage 2.
2935 			return PMAP_NULL;
2936 		}
2937 	}
2938 
	/* Reject any flag bits this implementation does not understand. */
2939 	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2940 		return PMAP_NULL;
2941 	}
2942 
2943 #if XNU_MONITOR
	/* PPL configuration: pmaps come from the PPL-owned pool, not a zone. */
2944 	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
2945 		goto pmap_create_fail;
2946 	}
2947 
2948 	assert(p != PMAP_NULL);
2949 
2950 	if (ledger) {
2951 		pmap_ledger_validate(ledger);
2952 		pmap_ledger_retain(ledger);
2953 	}
2954 #else
2955 	/*
2956 	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
2957 	 *	the translation table of the right size for the pmap.
2958 	 */
2959 	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2960 		local_kr = KERN_RESOURCE_SHORTAGE;
2961 		goto pmap_create_fail;
2962 	}
2963 #endif
2964 
2965 	p->ledger = ledger;
2966 
2967 
2968 	p->pmap_vm_map_cs_enforced = false;
2969 
2970 
2971 #if CONFIG_ROSETTA
2972 	if (flags & PMAP_CREATE_ROSETTA) {
2973 		p->is_rosetta = TRUE;
2974 	} else {
2975 		p->is_rosetta = FALSE;
2976 	}
2977 #endif /* CONFIG_ROSETTA */
2978 
2979 #if defined(HAS_APPLE_PAC)
2980 	p->disable_jop = disable_jop;
2981 #endif /* defined(HAS_APPLE_PAC) */
2982 
2983 	p->nested_region_true_start = 0;
2984 	p->nested_region_true_end = ~0;
2985 
2986 	p->nx_enabled = true;
2987 	p->is_64bit = is_64bit;
2988 	p->nested_pmap = PMAP_NULL;
2989 	p->type = PMAP_TYPE_USER;
2990 
2991 #if ARM_PARAMETERIZED_PMAP
2992 	/* Default to the native pt_attr */
2993 	p->pmap_pt_attr = native_pt_attr;
2994 #endif /* ARM_PARAMETERIZED_PMAP */
2995 #if __ARM_MIXED_PAGE_SIZE__
2996 	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2997 		p->pmap_pt_attr = &pmap_pt_attr_4k;
2998 	}
2999 #endif /* __ARM_MIXED_PAGE_SIZE__ */
3000 	p->max = pmap_user_va_size(p);
3001 	/* Don't allow mapping the first page (i.e. NULL or near-NULL). */
3002 	p->min = pt_attr_page_size(pmap_get_pt_attr(p));
3003 
	/* Reserve a hardware ASID for this address space. */
3004 	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
3005 		local_kr = KERN_NO_SPACE;
3006 		goto id_alloc_fail;
3007 	}
3008 
3009 	pmap_lock_init(p);
3010 
3011 	p->tt_entry_free = (tt_entry_t *)0;
3012 	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
3013 
3014 
3015 #if XNU_MONITOR
	/* The PPL cannot block, so the root table allocation must not wait. */
3016 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
3017 #else
3018 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
3019 #endif
3020 	if (!(p->tte)) {
3021 		local_kr = KERN_RESOURCE_SHORTAGE;
3022 		goto tt1_alloc_fail;
3023 	}
3024 
3025 	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
3026 	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
3027 
3028 	/* nullify the translation table */
3029 	for (i = 0; i < tte_index_max; i++) {
3030 		p->tte[i] = ARM_TTE_TYPE_FAULT;
3031 	}
3032 
3033 	FLUSH_PTE();
3034 
3035 	/*
3036 	 *  initialize the rest of the structure
3037 	 */
3038 	p->nested_region_addr = 0x0ULL;
3039 	p->nested_region_size = 0x0ULL;
3040 	p->nested_region_unnested_table_bitmap = NULL;
3041 	p->nested_region_unnested_table_bitmap_size = 0x0UL;
3042 
3043 	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
3044 	p->nested_no_bounds_refcnt = 0;
3045 	p->nested_bounds_set = false;
3046 
3047 
3048 #if MACH_ASSERT
3049 	p->pmap_pid = 0;
3050 	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3051 #endif /* MACH_ASSERT */
3052 #if DEVELOPMENT || DEBUG
3053 	p->footprint_was_suspended = FALSE;
3054 #endif /* DEVELOPMENT || DEBUG */
3055 
3056 #if XNU_MONITOR
3057 	os_atomic_init(&p->nested_count, 0);
3058 	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3059 	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3060 	os_atomic_thread_fence(release);
3061 #endif
3062 	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global list. */
3063 	pmap_simple_lock(&pmaps_lock);
3064 	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3065 	pmap_simple_unlock(&pmaps_lock);
3066 
3067 	/*
3068 	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
3069 	 * which can lead to a concurrent disconnect operation making the balance
3070 	 * transiently negative.  The ledger should still ultimately balance out,
3071 	 * which we still check upon pmap destruction.
3072 	 */
3073 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
3074 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
3075 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
3076 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
3077 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
3078 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
3079 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
3080 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
3081 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
3082 
3083 	return p;
3084 
	/* Error unwind: release resources in reverse order of acquisition. */
3085 tt1_alloc_fail:
3086 	pmap_get_pt_ops(p)->free_id(p);
3087 id_alloc_fail:
3088 #if XNU_MONITOR
3089 	pmap_free_pmap(p);
3090 
3091 	if (ledger) {
3092 		pmap_ledger_release(ledger);
3093 	}
3094 #else
3095 	zfree(pmap_zone, p);
3096 #endif
3097 pmap_create_fail:
3098 #if XNU_MONITOR
	/* *kr lives in kernel memory; pin it so the PPL may write through it. */
3099 	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3100 #endif
3101 	*kr = local_kr;
3102 #if XNU_MONITOR
3103 	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3104 #endif
3105 	return PMAP_NULL;
3106 }
3107 
/**
 * Public entry point for pmap creation.  Takes a ledger reference, dispatches
 * to the PPL or in-kernel implementation, and drops the ledger reference if
 * creation fails.
 *
 * @param ledger Ledger to charge the new pmap against.
 * @param size   Must be 0 for a hardware-referenced pmap.
 * @param flags  PMAP_CREATE_* flags.
 *
 * @return The new pmap, or PMAP_NULL on failure.
 */
3108 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3109 pmap_create_options(
3110 	ledger_t ledger,
3111 	vm_map_size_t size,
3112 	unsigned int flags)
3113 {
3114 	pmap_t pmap;
3115 	kern_return_t kr = KERN_SUCCESS;
3116 
3117 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3118 
3119 	ledger_reference(ledger);
3120 
3121 #if XNU_MONITOR
	/*
	 * The PPL cannot allocate pages itself; on resource shortage, donate a
	 * page to the PPL from here and retry until creation succeeds or fails
	 * for some other reason.
	 */
3122 	for (;;) {
3123 		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3124 		if (kr != KERN_RESOURCE_SHORTAGE) {
3125 			break;
3126 		}
3127 		assert(pmap == PMAP_NULL);
3128 		pmap_alloc_page_for_ppl(0);
3129 		kr = KERN_SUCCESS;
3130 	}
3131 #else
3132 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3133 #endif
3134 
3135 	if (pmap == PMAP_NULL) {
3136 		ledger_dereference(ledger);
3137 	}
3138 
	/*
	 * NOTE(review): the END trace reads pmap->hw_asid; if pmap is PMAP_NULL
	 * here this looks like a NULL dereference when tracing is enabled —
	 * confirm PMAP_TRACE's expansion guards against this.
	 */
3139 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3140 
3141 	return pmap;
3142 }
3143 
3144 #if XNU_MONITOR
3145 /*
3146  * This symbol remains in place when the PPL is enabled so that the dispatch
3147  * table does not change from development to release configurations.
3148  */
3149 #endif
3150 #if MACH_ASSERT || XNU_MONITOR
3151 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3152 pmap_set_process_internal(
3153 	__unused pmap_t pmap,
3154 	__unused int pid,
3155 	__unused char *procname)
3156 {
3157 #if MACH_ASSERT
3158 	if (pmap == NULL || pmap->pmap_pid == -1) {
3159 		return;
3160 	}
3161 
3162 	validate_pmap_mutable(pmap);
3163 
3164 	pmap->pmap_pid = pid;
3165 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3166 #endif /* MACH_ASSERT */
3167 }
3168 #endif /* MACH_ASSERT || XNU_MONITOR */
3169 
3170 #if MACH_ASSERT
/**
 * Record the owning process's pid and name on the pmap (debug builds only).
 * Dispatches to the PPL when the monitor is enabled, otherwise calls the
 * in-kernel implementation directly.
 */
3171 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3172 pmap_set_process(
3173 	pmap_t pmap,
3174 	int pid,
3175 	char *procname)
3176 {
3177 #if XNU_MONITOR
3178 	pmap_set_process_ppl(pmap, pid, procname);
3179 #else
3180 	pmap_set_process_internal(pmap, pid, procname);
3181 #endif
3182 }
3183 #endif /* MACH_ASSERT */
3184 
3185 /*
3186  * pmap_deallocate_all_leaf_tts:
3187  *
3188  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3189  * removing and deallocating all TTEs.
3190  */
3191 MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap,tt_entry_t * first_ttep,unsigned level)3192 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3193 {
3194 	tt_entry_t tte = ARM_TTE_EMPTY;
3195 	tt_entry_t * ttep = NULL;
3196 	tt_entry_t * last_ttep = NULL;
3197 
3198 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3199 
	/* Only non-leaf levels may be walked here. */
3200 	assert(level < pt_attr_leaf_level(pt_attr));
3201 
	/* Index of ~0 yields the last entry of the table at this level. */
3202 	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3203 
3204 	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3205 		tte = *ttep;
3206 
3207 		if (!(tte & ARM_TTE_VALID)) {
3208 			continue;
3209 		}
3210 
		/* Block mappings are never expected in a user pmap being torn down. */
3211 		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
3212 			panic("%s: found block mapping, ttep=%p, tte=%p, "
3213 			    "pmap=%p, first_ttep=%p, level=%u",
3214 			    __FUNCTION__, ttep, (void *)tte,
3215 			    pmap, first_ttep, level);
3216 		}
3217 
3218 		/* Must be valid, type table */
3219 		if (level < pt_attr_twig_level(pt_attr)) {
3220 			/* If we haven't reached the twig level, recurse to the next level. */
3221 			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3222 		}
3223 
3224 		/* Remove the TTE. */
3225 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() is visible here —
		 * presumably pmap_tte_deallocate() drops the exclusive lock on
		 * the caller's behalf; confirm against its definition.
		 */
3226 		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3227 	}
3228 }
3229 
3230 /*
3231  * We maintain stats and ledgers so that a task's physical footprint is:
3232  * phys_footprint = ((internal - alternate_accounting)
3233  *                   + (internal_compressed - alternate_accounting_compressed)
3234  *                   + iokit_mapped
3235  *                   + purgeable_nonvolatile
3236  *                   + purgeable_nonvolatile_compressed
3237  *                   + page_table)
3238  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3239  */
3240 
3241 /*
3242  *	Retire the given physical map from service.
3243  *	Should only be called if the map contains
3244  *	no valid mappings.
3245  */
/**
 * Drop a reference on the pmap and, when the last reference is released,
 * retire it: unlink it from the global list, free its page tables, flush its
 * TLB entries, release its ASID, and free the structure itself.
 *
 * @param pmap The pmap to release (PMAP_NULL is a no-op).
 */
3246 MARK_AS_PMAP_TEXT void
pmap_destroy_internal(pmap_t pmap)3247 pmap_destroy_internal(
3248 	pmap_t pmap)
3249 {
3250 	if (pmap == PMAP_NULL) {
3251 		return;
3252 	}
3253 
3254 	validate_pmap(pmap);
3255 
3256 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3257 
3258 	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
3259 	if (ref_count > 0) {
		/* Other references remain; nothing to tear down yet. */
3260 		return;
3261 	} else if (__improbable(ref_count < 0)) {
3262 		panic("pmap %p: refcount underflow", pmap);
3263 	} else if (__improbable(pmap == kernel_pmap)) {
3264 		panic("pmap %p: attempt to destroy kernel pmap", pmap);
3265 	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3266 		panic("pmap %p: attempt to destroy commpage pmap", pmap);
3267 	}
3268 
3269 #if XNU_MONITOR
3270 	/*
3271 	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3272 	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3273 	 * That ensures that if the pmap is currently in use elsewhere, this path will
3274 	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3275 	 * ref_count of 0 and panic.
3276 	 */
3277 	os_atomic_thread_fence(seq_cst);
3278 	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3279 		panic("pmap %p: attempt to destroy while nested", pmap);
3280 	}
3281 	const int max_cpu = ml_get_max_cpu_number();
3282 	for (unsigned int i = 0; i <= max_cpu; ++i) {
3283 		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3284 		if (cpu_data == NULL) {
3285 			continue;
3286 		}
3287 		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3288 			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3289 		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3290 			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3291 		}
3292 	}
3293 #endif
3294 	pmap_unmap_commpage(pmap);
3295 
	/* Unlink from the global pmap list before tearing down state. */
3296 	pmap_simple_lock(&pmaps_lock);
3297 	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3298 	pmap_simple_unlock(&pmaps_lock);
3299 
3300 	pmap_trim_self(pmap);
3301 
3302 	/*
3303 	 *	Free the memory maps, then the
3304 	 *	pmap structure.
3305 	 */
3306 	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3307 
3308 
3309 
3310 	if (pmap->tte) {
3311 		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3312 		pmap->tte = (tt_entry_t *) NULL;
3313 		pmap->ttep = 0;
3314 	}
3315 
3316 	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3317 
3318 	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps have no ASID; flush by VA range instead. */
3319 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
3320 		sync_tlb_flush();
3321 	} else {
3322 		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3323 		sync_tlb_flush();
3324 		/* return its asid to the pool */
3325 		pmap_get_pt_ops(pmap)->free_id(pmap);
3326 		if (pmap->nested_pmap != NULL) {
3327 #if XNU_MONITOR
3328 			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3329 #endif
3330 			/* release the reference we hold on the nested pmap */
			/* Note: recurses (one level) into this same function. */
3331 			pmap_destroy_internal(pmap->nested_pmap);
3332 		}
3333 	}
3334 
3335 	pmap_check_ledgers(pmap);
3336 
3337 	if (pmap->nested_region_unnested_table_bitmap) {
3338 #if XNU_MONITOR
3339 		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
3340 #else
3341 		kfree_data(pmap->nested_region_unnested_table_bitmap,
3342 		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
3343 #endif
3344 	}
3345 
3346 #if XNU_MONITOR
3347 	if (pmap->ledger) {
3348 		pmap_ledger_release(pmap->ledger);
3349 	}
3350 
3351 	pmap_lock_destroy(pmap);
3352 	pmap_free_pmap(pmap);
3353 #else
3354 	pmap_lock_destroy(pmap);
3355 	zfree(pmap_zone, pmap);
3356 #endif
3357 }
3358 
/**
 * Public entry point for dropping a pmap reference; see
 * pmap_destroy_internal() for the teardown performed on last release.
 */
3359 void
pmap_destroy(pmap_t pmap)3360 pmap_destroy(
3361 	pmap_t pmap)
3362 {
3363 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3364 
	/* Capture the ledger now: the pmap structure may be freed below. */
3365 	ledger_t ledger = pmap->ledger;
3366 
3367 #if XNU_MONITOR
3368 	pmap_destroy_ppl(pmap);
3369 
3370 	pmap_ledger_check_balance(pmap);
3371 #else
3372 	pmap_destroy_internal(pmap);
3373 #endif
3374 
3375 	ledger_dereference(ledger);
3376 
3377 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3378 }
3379 
3380 
3381 /*
3382  *	Add a reference to the specified pmap.
3383  */
3384 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3385 pmap_reference_internal(
3386 	pmap_t pmap)
3387 {
3388 	if (pmap != PMAP_NULL) {
3389 		validate_pmap_mutable(pmap);
3390 		os_atomic_inc(&pmap->ref_count, relaxed);
3391 	}
3392 }
3393 
/**
 * Public entry point for taking a pmap reference; dispatches to the PPL when
 * the monitor is enabled.
 */
3394 void
pmap_reference(pmap_t pmap)3395 pmap_reference(
3396 	pmap_t pmap)
3397 {
3398 #if XNU_MONITOR
3399 	pmap_reference_ppl(pmap);
3400 #else
3401 	pmap_reference_internal(pmap);
3402 #endif
3403 }
3404 
/**
 * Allocate a root (TT1) translation table of the requested size, first from
 * the size-specific free lists, then from fresh zeroed pages.  Sub-page
 * allocations carve the remainder of the page onto the sub-page free list.
 *
 * @param pmap   The pmap to charge the allocation against.
 * @param size   Requested table size; sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block.
 *
 * @return KVA of the new table, or NULL on resource shortage.
 */
3405 static tt_entry_t *
pmap_tt1_allocate(pmap_t pmap,vm_size_t size,unsigned option)3406 pmap_tt1_allocate(
3407 	pmap_t          pmap,
3408 	vm_size_t       size,
3409 	unsigned        option)
3410 {
3411 	tt_entry_t      *tt1 = NULL;
3412 	tt_free_entry_t *tt1_free;
3413 	pmap_paddr_t    pa;
3414 	vm_address_t    va;
3415 	vm_address_t    va_end;
3416 	kern_return_t   ret;
3417 
	/* Only PMAP_ROOT_ALLOC_SIZE may be sub-page; everything else is page-granular. */
3418 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3419 		size = PAGE_SIZE;
3420 	}
3421 
	/* Fast path: reuse a table off the matching free list. */
3422 	pmap_simple_lock(&tt1_lock);
3423 	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3424 		free_page_size_tt_count--;
3425 		tt1 = (tt_entry_t *)free_page_size_tt_list;
3426 		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3427 	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
3428 		free_two_page_size_tt_count--;
3429 		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
3430 		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3431 	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3432 		free_tt_count--;
3433 		tt1 = (tt_entry_t *)free_tt_list;
3434 		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3435 	}
3436 
3437 	pmap_simple_unlock(&tt1_lock);
3438 
3439 	if (tt1 != NULL) {
3440 		pmap_tt_ledger_credit(pmap, size);
3441 		return (tt_entry_t *)tt1;
3442 	}
3443 
	/* Slow path: allocate whole zeroed page(s). */
3444 	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3445 
3446 	if (ret == KERN_RESOURCE_SHORTAGE) {
3447 		return (tt_entry_t *)0;
3448 	}
3449 
3450 #if XNU_MONITOR
3451 	assert(pa);
3452 #endif
3453 
3454 	if (size < PAGE_SIZE) {
		/*
		 * Carve the rest of the page into sub-page tables, chain them
		 * locally, then splice the whole chain onto the free list under
		 * one lock acquisition.
		 */
3455 		va = phystokv(pa) + size;
3456 		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3457 		tt_free_entry_t *next_free = NULL;
3458 		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3459 			tt1_free = (tt_free_entry_t *)va;
3460 			tt1_free->next = next_free;
3461 			next_free = tt1_free;
3462 		}
3463 		pmap_simple_lock(&tt1_lock);
3464 		local_free_list->next = free_tt_list;
3465 		free_tt_list = next_free;
3466 		free_tt_count += ((PAGE_SIZE / size) - 1);
3467 		if (free_tt_count > free_tt_max) {
3468 			free_tt_max = free_tt_count;
3469 		}
3470 		pmap_simple_unlock(&tt1_lock);
3471 	}
3472 
3473 	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
3474 	 * Depending on the device, this can vary between 512b and 16K. */
3475 	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3476 	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3477 	pmap_tt_ledger_credit(pmap, size);
3478 
3479 	return (tt_entry_t *) phystokv(pa);
3480 }
3481 
/**
 * Return a root (TT1) translation table to the appropriate free list, then
 * (unless PMAP_TT_DEALLOCATE_NOBLOCK) trim the page-sized free lists back
 * below their high-water marks, freeing surplus pages to the system.
 *
 * @param pmap   The pmap the table was charged against.
 * @param tt     KVA of the table being released.
 * @param size   Size of the table (sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE).
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trimming pass.
 */
3482 static void
pmap_tt1_deallocate(pmap_t pmap,tt_entry_t * tt,vm_size_t size,unsigned option)3483 pmap_tt1_deallocate(
3484 	pmap_t pmap,
3485 	tt_entry_t *tt,
3486 	vm_size_t size,
3487 	unsigned option)
3488 {
3489 	tt_free_entry_t *tt_entry;
3490 
	/* Mirror the size normalization done in pmap_tt1_allocate(). */
3491 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3492 		size = PAGE_SIZE;
3493 	}
3494 
3495 	tt_entry = (tt_free_entry_t *)tt;
3496 	assert(not_in_kdp);
3497 	pmap_simple_lock(&tt1_lock);
3498 
	/* Push the released table onto the free list matching its size. */
3499 	if (size < PAGE_SIZE) {
3500 		free_tt_count++;
3501 		if (free_tt_count > free_tt_max) {
3502 			free_tt_max = free_tt_count;
3503 		}
3504 		tt_entry->next = free_tt_list;
3505 		free_tt_list = tt_entry;
3506 	}
3507 
3508 	if (size == PAGE_SIZE) {
3509 		free_page_size_tt_count++;
3510 		if (free_page_size_tt_count > free_page_size_tt_max) {
3511 			free_page_size_tt_max = free_page_size_tt_count;
3512 		}
3513 		tt_entry->next = free_page_size_tt_list;
3514 		free_page_size_tt_list = tt_entry;
3515 	}
3516 
3517 	if (size == 2 * PAGE_SIZE) {
3518 		free_two_page_size_tt_count++;
3519 		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
3520 			free_two_page_size_tt_max = free_two_page_size_tt_count;
3521 		}
3522 		tt_entry->next = free_two_page_size_tt_list;
3523 		free_two_page_size_tt_list = tt_entry;
3524 	}
3525 
3526 	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller cannot block: skip trimming, which frees pages. */
3527 		pmap_simple_unlock(&tt1_lock);
3528 		pmap_tt_ledger_debit(pmap, size);
3529 		return;
3530 	}
3531 
	/*
	 * Trim each page-granular free list down to its cap.  The lock is
	 * dropped around each pmap_pages_free() call (which may block) and
	 * re-taken, so the counts are re-checked on every iteration.
	 */
3532 	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
3533 		free_page_size_tt_count--;
3534 		tt = (tt_entry_t *)free_page_size_tt_list;
3535 		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3536 
3537 		pmap_simple_unlock(&tt1_lock);
3538 
3539 		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
3540 
3541 		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3542 
3543 		pmap_simple_lock(&tt1_lock);
3544 	}
3545 
3546 	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
3547 		free_two_page_size_tt_count--;
3548 		tt = (tt_entry_t *)free_two_page_size_tt_list;
3549 		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3550 
3551 		pmap_simple_unlock(&tt1_lock);
3552 
3553 		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
3554 
3555 		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3556 
3557 		pmap_simple_lock(&tt1_lock);
3558 	}
3559 	pmap_simple_unlock(&tt1_lock);
3560 	pmap_tt_ledger_debit(pmap, size);
3561 }
3562 
/**
 * Allocate a page table for the given pmap and level.
 *
 * A previously-freed entry from pmap->tt_entry_free is reused when available;
 * otherwise a full VM page is allocated and assigned a page table descriptor
 * (PTD).  When the pmap's native page table size is smaller than the VM page
 * size, the unused sub-page chunks of the new page are chained onto
 * pmap->tt_entry_free for later reuse.
 *
 * @param pmap The pmap the table is being allocated for.
 * @param ttp Output parameter; on success receives the KVA of the new table.
 * @param level Page table level the table will be used at (accounting only).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid blocking.
 *
 * @return KERN_SUCCESS on success, KERN_RESOURCE_SHORTAGE if a NOWAIT
 *         allocation failed, or KERN_ABORTED if a preemptible lock
 *         acquisition was aborted.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the pmap's free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Track twig-and-above tables separately from leaf tables. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Point the page's PV head at the new descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Chain the page's remaining sub-page chunks onto the pmap's free list. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3661 
3662 
/**
 * Release a page table previously handed out by pmap_tt_allocate().
 *
 * The table is normally pushed back onto pmap->tt_entry_free.  If, after this
 * release, every sub-page table sharing the containing VM page has a zero
 * refcount and all of the page's other chunks are already on the free list,
 * the page's entries are unlinked from the free list and the whole page is
 * returned to the VM.
 *
 * @param pmap The pmap that owns the page table being freed.
 * @param ttp KVA of the page table to free; its refcount must be zero
 *            (or the non-leaf sentinel PT_DESC_REFCOUNT).
 * @param level The level the table was used at (for accounting).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-native page tables that fit in one VM page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables hold the sentinel refcount; normalize it before the check. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every sub-page table that shares this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Pass 1: count free-list entries (plus ttp itself) that lie on
		 * ttp's VM page.  If all of the page's chunks are free, the page
		 * can be returned to the VM.
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/* Pass 2: unlink every free-list entry residing on the page. */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Next entry is on the doomed page: splice it out. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Not every chunk of the page is free yet; keep ttp on the list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables on the page are still in use; just push ttp on the list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* The whole page is free: drop its PTD, clear the PV head, return the page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3751 
3752 /**
3753  * Safely clear out a translation table entry.
3754  *
3755  * @note If the TTE to clear out points to a leaf table, then that leaf table
3756  *       must have a refcnt of zero before the TTE can be removed.
3757  * @note This function expects to be called with pmap locked exclusive, and will
3758  *       return with pmap unlocked.
3759  *
3760  * @param pmap The pmap containing the page table whose TTE is being removed.
3761  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3762  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3763  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3764  * @param ttep Pointer to the TTE that should be cleared out.
3765  * @param level The level of the page table that contains the TTE to be removed.
3766  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and make the store visible before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	/* The TTE is cleared; the remaining consistency checks don't need the lock. */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			/* Re-poll only the CPUs still marked active in the bitmap. */
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Exclusive monitor is armed; WFE waits for the flag to change. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3941 
3942 /**
3943  * Given a pointer to an entry within a `level` page table, delete the
3944  * page table at `level` + 1 that is represented by that entry. For instance,
3945  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3946  * contains the PA of the L3 table, and `level` would be "2".
3947  *
3948  * @note If the table getting deallocated is a leaf table, then that leaf table
3949  *       must have a refcnt of zero before getting deallocated. All other levels
3950  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3951  * @note This function expects to be called with pmap locked exclusive and will
3952  *       return with pmap unlocked.
3953  *
3954  * @param pmap The pmap that owns the page table to be deallocated.
3955  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3956  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3957  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3958  * @param ttep Pointer to the `level` TTE to remove.
3959  * @param level The level of the table that contains an entry pointing to the
3960  *              table to be removed. The deallocated page table will be a
3961  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3962  *              deleted).
3963  */
3964 void
3965 pmap_tte_deallocate(
3966 	pmap_t pmap,
3967 	vm_offset_t va_start,
3968 	vm_offset_t va_end,
3969 	bool need_strong_sync,
3970 	tt_entry_t *ttep,
3971 	unsigned int level)
3972 {
3973 	tt_entry_t tte;
3974 
3975 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3976 
3977 	tte = *ttep;
3978 
3979 	if (tte_get_ptd(tte)->pmap != pmap) {
3980 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3981 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3982 	}
3983 
3984 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3985 	    __func__, ttep, (unsigned long long)tte);
3986 
3987 	/* pmap_tte_remove() will drop the pmap lock */
3988 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3989 
3990 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3991 }
3992 
3993 /*
3994  *	Remove a range of hardware page-table entries.
3995  *	The entries given are the first (inclusive)
3996  *	and last (exclusive) entries for the VM pages.
3997  *	The virtual address is the va for the first pte.
3998  *
3999  *	The pmap must be locked.
4000  *	If the pmap is not the kernel pmap, the range must lie
4001  *	entirely within one pte-page.  This is NOT checked.
4002  *	Assumes that the pte-page exists.
4003  *
4004  *	Returns the number of PTE changed
4005  */
4006 MARK_AS_PMAP_TEXT static int
4007 pmap_remove_range(
4008 	pmap_t pmap,
4009 	vm_map_address_t va,
4010 	pt_entry_t *bpte,
4011 	pt_entry_t *epte)
4012 {
4013 	bool need_strong_sync = false;
4014 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4015 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4016 	if (num_changed > 0) {
4017 		PMAP_UPDATE_TLBS(pmap, va,
4018 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4019 	}
4020 	return num_changed;
4021 }
4022 
4023 
4024 #ifdef PVH_FLAG_EXEC
4025 
4026 /*
4027  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
4029  *	has no writable mappings anywhere in the system, including the physical
4030  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4031  *	synchronization overhead in cases where the call to this function is
4032  *	guaranteed to be followed by other TLB operations.
4033  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the kernel's physical-aperture PTE for this physical page index. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the requested AP setting is already in place. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A hinted (contiguous) mapping can't be rewritten one page at a time. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Invalidate the stale translation; sync now unless the caller defers it. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4058 #endif /* defined(PVH_FLAG_EXEC) */
4059 
4060 
4061 
/**
 * Remove a range of PTEs within a single page table, updating PV lists,
 * wired/refcnt accounting, and task ledgers.  TLB maintenance is NOT done
 * here beyond FLUSH_PTE_STRONG(); the caller flushes using the return value.
 *
 * @param pmap The pmap whose mappings are being removed (locked exclusive).
 * @param va VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE; must not cross a page table boundary.
 * @param eva If non-NULL, enables preemption checks; on early exit receives
 *            the VA at which removal stopped.
 * @param need_strong_sync Set to true if a strong DSB is needed for TLB sync.
 * @param options PMAP_OPTIONS_REMOVE to also clear "compressed" markers.
 *
 * @return The number of PTEs actually changed (for the caller's TLB flush).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	/* Net adjustment to apply to the page table's refcount at the end. */
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/* Loop until we either hold the PVH lock for a stable PTE or decide it's unmanaged. */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read under the PVH lock; retry if the PTE changed pages underneath us. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger adjustments below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta to the page table's descriptor. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4303 
4304 
4305 /*
4306  *	Remove the given range of addresses
4307  *	from the specified map.
4308  *
4309  *	It is assumed that the start and end are properly
4310  *	rounded to the hardware page size.
4311  */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* Convenience wrapper: remove with the default removal options. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4320 
/**
 * Remove mappings in [start, end) from the pmap; the range must be leaf-page
 * aligned and (for non-kernel pmaps) must lie within a single page table.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start Beginning of the VA range (leaf-page aligned).
 * @param end Non-inclusive end of the VA range (leaf-page aligned).
 * @param options Removal options passed through to pmap_remove_range_options().
 *
 * @return The VA up to which removal actually completed; may be less than
 *         `end` if the operation stopped early for pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry means nothing is mapped in this range. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Compute the PTE span for [start, end) within the leaf table. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty, tear it down entirely. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4387 
/*
 * Remove all mappings in [start, end) from the given pmap, honoring the
 * supplied PMAP_OPTIONS_* flags.
 *
 * The work is split into twig-boundary (L2-entry) sized chunks so that each
 * call into the (possibly PPL-resident) internal routine operates within a
 * single leaf table, and so the operation remains preemptible between chunks.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the next twig (L2 entry) boundary, or to 'end'. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* The internal routine runs inside the PPL; it returns the resume VA. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4440 
4441 
4442 /*
4443  *	Remove phys addr if mapped in specified map
4444  */
4445 void
4446 pmap_remove_some_phys(
4447 	__unused pmap_t map,
4448 	__unused ppnum_t pn)
4449 {
4450 	/* Implement to support working set code */
4451 }
4452 
4453 /*
4454  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4455  * switch a thread onto a new vm_map.
4456  */
4457 void
4458 pmap_switch_user(thread_t thread, vm_map_t new_map)
4459 {
4460 	pmap_t new_pmap = new_map->pmap;
4461 
4462 
4463 	thread->map = new_map;
4464 	pmap_set_pmap(new_pmap, thread);
4465 
4466 }
4467 
/*
 * Activate the given pmap on the current CPU for the given thread.
 * On __ARM_USER_PROTECT__ configurations, also cache the user TTB and ASID
 * in the thread's machine state so they can be restored on context switch.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4482 
4483 static void
4484 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4485 {
4486 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4487 }
4488 
4489 static inline bool
4490 pmap_user_ttb_is_clear(void)
4491 {
4492 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4493 }
4494 
/*
 * Switch the current CPU onto the given pmap's address space.
 *
 * Decides whether the switch requires TLB maintenance (ASID alias flush,
 * shared-region flush, or commpage flush on mixed-page-size systems) and
 * whether a "break-before-make" transition through the invalid TTB is needed
 * before programming the new user TTB.  All TLB invalidations are issued
 * asynchronously and then synchronized locally before the TTB switch.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* With kernel-protect, ASIDs are allocated in even/odd pairs; use the pair index. */
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page-size configuration) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Point the user TTB at the invalid table while flushes are pending (break-before-make). */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range-TLBI support: the full-core flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Wait for all async invalidations issued above to complete before switching TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4629 
/*
 * Switch the current CPU to the given pmap, dispatching into the PPL on
 * XNU_MONITOR configurations.  Emits PMAP__SWITCH trace events around the call.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4642 
/*
 * Lower the permission for all mappings of the given physical page.
 * Convenience wrapper for pmap_page_protect_options() with no options.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4650 
4651 /*
4652  *	Routine:	pmap_page_protect_options
4653  *
4654  *	Function:
4655  *		Lower the permission for all mappings to a given
4656  *		page.
4657  */
/*
 * Lower the protection of (or entirely remove, for protections below
 * VM_PROT_READ) every CPU mapping of the physical page 'ppnum', by walking
 * the page's physical-to-virtual (PV) list.
 *
 * The work is done in two passes over the PV list:
 *   Pass 1: rewrite PTEs (downgrade permissions or mark faulted/compressed)
 *           and adjust the owning pmaps' ledgers and page attributes.
 *   Pass 2: issue TLB invalidations and, when removing, unlink the CPU
 *           entries from the PV list.  IOMMU-owned entries are never removed;
 *           they are preserved on a rebuilt PV list.
 * The two passes must agree on the number of updated mappings (panics
 * otherwise).
 *
 * If 'flush_range' is non-NULL, TLB invalidation for downgraded mappings that
 * fall within the range may be deferred to the caller (ptfr_flush_needed is
 * set instead of flushing here); removals are always flushed before the PVH
 * lock is dropped, per the PPL security model.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	pv_entry_t     *pveh_p;
	pv_entry_t     *pvet_p;
	pt_entry_t     *pte_p, *orig_pte_p;
	pv_entry_t     *new_pve_p;
	pt_entry_t     *new_pte_p;
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	unsigned int    pvh_cnt = 0;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU-owned mappings are not CPU PTEs; pass 1 leaves them untouched. */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
				    __func__, pmap, ppnum);
			}

			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			/* Downgrade to read-only: kernel mappings use RONA, user mappings the per-config RO bits. */
			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize pass-1 PTE stores before any TLB invalidation references them. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				/* Pass 1 tagged this PTE as updated; clear the tag now that we'll flush it. */
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
				pmap_flush_noncoherent_page(phys);
			}
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/* For pure downgrades within a flush_range, defer the TLB flush to the caller. */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}


	pvh_unlock(pai);

	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* Downgrade-only flushes can safely complete after the PVH lock is dropped. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5175 
5176 MARK_AS_PMAP_TEXT void
5177 pmap_page_protect_options_internal(
5178 	ppnum_t ppnum,
5179 	vm_prot_t prot,
5180 	unsigned int options,
5181 	void *arg)
5182 {
5183 	if (arg != NULL) {
5184 		/*
5185 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5186 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5187 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5188 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5189 		 * In that case, force the flush to take place.
5190 		 */
5191 		options &= ~PMAP_OPTIONS_NOFLUSH;
5192 	}
5193 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5194 }
5195 
/*
 * Lower the permission for all mappings of the given managed physical page,
 * dispatching into the PPL on XNU_MONITOR configurations.  Requests that
 * would not reduce permissions (VM_PROT_ALL) and unmanaged pages are no-ops.
 */
void
pmap_page_protect_options(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pmap_paddr_t    phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	if (prot == VM_PROT_ALL) {
		return;         /* nothing to do */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);

#if XNU_MONITOR
	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
#else
	pmap_page_protect_options_internal(ppnum, prot, options, arg);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
}
5229 
5230 
5231 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5232 MARK_AS_PMAP_TEXT void
5233 pmap_disable_user_jop_internal(pmap_t pmap)
5234 {
5235 	if (pmap == kernel_pmap) {
5236 		panic("%s: called with kernel_pmap", __func__);
5237 	}
5238 	validate_pmap_mutable(pmap);
5239 	pmap->disable_jop = true;
5240 }
5241 
void
pmap_disable_user_jop(pmap_t pmap)
{
	/* Dispatch to the PPL on monitor-enabled systems, otherwise call directly. */
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5251 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5252 
5253 /*
5254  * Indicates if the pmap layer enforces some additional restrictions on the
5255  * given set of protections.
5256  */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/* This pmap implementation imposes no additional protection policy. */
	return false;
}
5262 
5263 /*
5264  *	Set the physical protection on the
5265  *	specified range of this map as requested.
5266  *	VERY IMPORTANT: Will not increase permissions.
5267  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5268  */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Thin wrapper: defer to the options-taking variant with no options or args. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5278 
/*
 * Internal helper for pmap_protect_options(): reduce the protections on the
 * mappings in [start, end), which must lie entirely within a single
 * twig-level translation table region (enforced by panic below).
 *
 * Returns the VA just past the last PTE actually processed.  This may be
 * less than 'end' if the loop bailed out early because preemption was
 * pending; the caller is expected to resume from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must not span a twig boundary; the caller chunks requests accordingly. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Requests that strip all access must go through pmap_remove_options() instead. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/*
			 * Periodically poll for pending preemption; bail out early if it
			 * is pending.  'va' then marks the resume point for the caller.
			 */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * Take the PVH lock for the page backing this PTE, re-reading the
			 * PTE afterward to make sure we locked the right PV head (the
			 * mapping may be concurrently changed out from under us).
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have become invalid/compressed while we raced for the lock. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the new PTE: same PA/attributes, downgraded access permissions. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish the PTE updates and flush the TLBs for the range actually processed. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table here; nothing mapped in this range, so report it fully processed. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5526 
/*
 * Reduce the protections on the mappings in [b, e) of the given pmap to
 * 'prot'.  Both addresses must be page-aligned.  Requests that would remove
 * all access are forwarded to pmap_remove_options(); requests that cannot
 * reduce permissions are no-ops.  The work is chunked at twig-level table
 * boundaries because the internal helper operates on one twig region at a
 * time.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's leaf page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Process the range one twig-level region at a time.  The helper returns
	 * the VA it stopped at; if it bailed out early (e.g. for pending
	 * preemption), the loop naturally retries from that address.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5601 
5602 /**
5603  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5604  *
5605  * @param pmap pmap to insert the pages into.
5606  * @param va virtual address to map the pages into.
5607  * @param pa page number of the first physical page to map.
5608  * @param size block size, in number of pages.
5609  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags forwarded to pmap_map_block_addr().
5611  *
5612  * @return KERN_SUCCESS.
5613  */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	unsigned int flags)
{
	/* Convert the page number to a physical address and defer to pmap_map_block_addr(). */
	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
}
5626 
5627 /**
5628  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5629  * As opposed to pmap_map_block(), this function takes
5630  * a physical address as an input and operates using the
5631  * page size associated with the input pmap.
5632  *
5633  * @param pmap pmap to insert the pages into.
5634  * @param va virtual address to map the pages into.
5635  * @param pa physical address of the first physical page to map.
5636  * @param size block size, in number of pages.
5637  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags reported in the panic message on failure; not otherwise used.
5639  *
5640  * @return KERN_SUCCESS.
5641  */
5642 kern_return_t
5643 pmap_map_block_addr(
5644 	pmap_t pmap,
5645 	addr64_t va,
5646 	pmap_paddr_t pa,
5647 	uint32_t size,
5648 	vm_prot_t prot,
5649 	int attr,
5650 	unsigned int flags)
5651 {
5652 #if __ARM_MIXED_PAGE_SIZE__
5653 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5654 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5655 #else
5656 	const uint64_t pmap_page_size = PAGE_SIZE;
5657 #endif
5658 
5659 	for (ppnum_t page = 0; page < size; page++) {
5660 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5661 			panic("%s: failed pmap_enter_addr, "
5662 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5663 			    __FUNCTION__,
5664 			    pmap, va, (uint64_t)pa, size, prot, flags);
5665 		}
5666 
5667 		va += pmap_page_size;
5668 		pa += pmap_page_size;
5669 	}
5670 
5671 	return KERN_SUCCESS;
5672 }
5673 
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	/* Thin wrapper: no options/arg, and let the mapping type be inferred. */
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5686 
5687 /*
5688  *	Insert the given physical page (p) at
5689  *	the specified virtual address (v) in the
5690  *	target physical map with the protection requested.
5691  *
5692  *	If specified, the page will be wired down, meaning
5693  *	that the related pte can not be reclaimed.
5694  *
5695  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5696  *	or lose information.  That is, this routine must actually
 *	insert this page into the given map eventually (must make
 *	forward progress eventually).
5699  */
kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_address_t v,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	__unused pmap_mapping_type_t mapping_type)
{
	/* Convert the page number to a physical address and defer to pmap_enter_addr(). */
	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
}
5713 
5714 /*
5715  * Attempt to commit the pte.
5716  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5717  * Performs no page table or accounting writes on failures.
5718  */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/* Single-PTE case: commit atomically; *old_pte receives the observed prior value. */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	/* Post-commit bookkeeping: only needed if we actually changed the PTE. */
	if (success && *old_pte != new_pte) {
		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* We replaced a previously-valid mapping, so the TLBs must be updated. */
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			/* Prior entry was not valid: no TLB flush needed, just publish the write. */
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/*
		 * A compressed prior entry has no meaningful wired bit, so wiring "changed"
		 * iff the new PTE is wired; otherwise compare old and new wired bits.
		 */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		/* Keep the page table's wired count and the task's wired-memory ledger in sync. */
		if (pmap != kernel_pmap && changed_wiring) {
			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5784 
/*
 * Translate a VM_WIMG_* cacheability/memory-type code into the corresponding
 * PTE attribute index, shareability, and execute-never bits.  The physical
 * address is consulted because some device-type requests are mapped with a
 * reordered-posted attribute when they target DRAM.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		/* Non-DRAM targets get the XS variant of the attribute on FEAT_XS hardware. */
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized WIMG codes fall back to the default (cacheable) attribute. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5856 
5857 
5858 /*
5859  * Construct a PTE (and the physical page attributes) for the given virtual to
5860  * physical mapping.
5861  *
5862  * This function has no side effects and is safe to call so that it is safe to
5863  * call while attempting a pmap_enter transaction.
5864  */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	uint16_t *pp_attr_bits /* OUTPUT: PP_ATTR_* bits the caller should set on the page */
	)
{
	bool set_NX = false, set_XO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/* Non-executable unless execute permission was requested (or NX is globally disabled on DEV/DEBUG). */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only mapping: no read/write permission requested alongside execute. */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable mappings are still user-NX. */
			pte |= ARM_PTE_NX;
		} else {
			/* User-executable mappings are still privileged-NX. */
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		/*
		 * Non-nested user mappings are per-ASID (non-global).  Within a nested
		 * (shared-region) pmap, a mapping is made non-global only if its twig
		 * region has been marked unnested in the bitmap.
		 */
		if (pmap->type != PMAP_TYPE_NESTED) {
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_unnested_table_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));

			/* NOTE(review): the bitmap NULL check below repeats the outer condition. */
			if ((pmap->nested_region_unnested_table_bitmap)
			    && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (set_XO) {
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Pre-set the access flag so the first access does not take an AF fault. */
	pte |= ARM_PTE_AF;
	return pte;
}
5967 
5968 MARK_AS_PMAP_TEXT kern_return_t
5969 pmap_enter_options_internal(
5970 	pmap_t pmap,
5971 	vm_map_address_t v,
5972 	pmap_paddr_t pa,
5973 	vm_prot_t prot,
5974 	vm_prot_t fault_type,
5975 	unsigned int flags,
5976 	boolean_t wired,
5977 	unsigned int options)
5978 {
5979 	ppnum_t         pn = (ppnum_t)atop(pa);
5980 	pt_entry_t      pte;
5981 	pt_entry_t      spte;
5982 	pt_entry_t      *pte_p;
5983 	bool            refcnt_updated;
5984 	bool            wiredcnt_updated;
5985 	bool            ro_va = false;
5986 	unsigned int    wimg_bits;
5987 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
5988 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
5989 	kern_return_t   kr = KERN_SUCCESS;
5990 	uint16_t pp_attr_bits;
5991 	volatile uint16_t *refcnt;
5992 	volatile uint16_t *wiredcnt;
5993 	pv_free_list_t *local_pv_free;
5994 
5995 	validate_pmap_mutable(pmap);
5996 
5997 #if XNU_MONITOR
5998 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
5999 		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6000 	}
6001 #endif
6002 
6003 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6004 
6005 	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6006 		panic("%s: pmap %p v 0x%llx not page-aligned",
6007 		    __func__, pmap, (unsigned long long)v);
6008 	}
6009 
6010 	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6011 		panic("%s: attempt to map out-of-bounds VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6012 	}
6013 
6014 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6015 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6016 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6017 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6018 	}
6019 
6020 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6021 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6022 		    pmap, (uint64_t)pa);
6023 	}
6024 
6025 	/* The PA should not extend beyond the architected physical address space */
6026 	pa &= ARM_PTE_PAGE_MASK;
6027 
6028 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6029 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6030 		extern vm_offset_t ctrr_test_page;
6031 		if (__probable(v != ctrr_test_page))
6032 #endif
6033 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6034 	}
6035 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6036 		if (__improbable(prot != VM_PROT_READ)) {
6037 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6038 			    __func__, (unsigned long long)v, prot);
6039 		}
6040 		ro_va = true;
6041 	}
6042 	assert(pn != vm_page_fictitious_addr);
6043 
6044 	refcnt_updated = false;
6045 	wiredcnt_updated = false;
6046 
6047 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6048 		/*
6049 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6050 		 *
6051 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6052 		 */
6053 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6054 	}
6055 
6056 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6057 		return KERN_ABORTED;
6058 	}
6059 
6060 	/*
6061 	 *	Expand pmap to include this pte.  Assume that
6062 	 *	pmap is always expanded to include enough hardware
6063 	 *	pages to map one VM page.
6064 	 */
6065 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6066 		/* Must unlock to expand the pmap. */
6067 		pmap_unlock(pmap, lock_mode);
6068 
6069 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6070 
6071 		if (kr != KERN_SUCCESS) {
6072 			return kr;
6073 		}
6074 
6075 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6076 			return KERN_ABORTED;
6077 		}
6078 	}
6079 
6080 	if (options & PMAP_OPTIONS_NOENTER) {
6081 		pmap_unlock(pmap, lock_mode);
6082 		return KERN_SUCCESS;
6083 	}
6084 
6085 	/*
6086 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6087 	 * done via a cmpxchg loop.
6088 	 * We need to be careful about modifying non-local data structures before commiting
6089 	 * the new pte since we may need to re-do the transaction.
6090 	 */
6091 	spte = os_atomic_load(pte_p, relaxed);
6092 	while (!committed) {
6093 		refcnt = NULL;
6094 		wiredcnt = NULL;
6095 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6096 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6097 
6098 		if (pmap != kernel_pmap) {
6099 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6100 			refcnt = &ptd_info->refcnt;
6101 			wiredcnt = &ptd_info->wiredcnt;
6102 			/*
6103 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6104 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6105 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6106 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6107 			 * have PTDs, so we can't use the check there.
6108 			 */
6109 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6110 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6111 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6112 			}
6113 			/*
6114 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6115 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6116 			 * or acquire the pmap lock exclusive.
6117 			 */
6118 			if (!wiredcnt_updated) {
6119 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6120 				wiredcnt_updated = true;
6121 			}
6122 			if (!refcnt_updated) {
6123 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6124 				refcnt_updated = true;
6125 				drop_refcnt = true;
6126 			}
6127 		}
6128 
6129 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6130 			/*
6131 			 * There is already a mapping here & it's for a different physical page.
6132 			 * First remove that mapping.
6133 			 *
6134 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6135 			 */
6136 			if (lock_mode == PMAP_LOCK_SHARED) {
6137 				if (pmap_lock_shared_to_exclusive(pmap)) {
6138 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6139 				} else {
6140 					/*
6141 					 * We failed to upgrade to an exclusive lock.
6142 					 * As a result we no longer hold the lock at all,
6143 					 * so we need to re-acquire it and restart the transaction.
6144 					 */
6145 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6146 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6147 					/* pmap might have changed after we dropped the lock. Try again. */
6148 					spte = os_atomic_load(pte_p, relaxed);
6149 					continue;
6150 				}
6151 			}
6152 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6153 			spte = ARM_PTE_TYPE_FAULT;
6154 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6155 		}
6156 
6157 		/*
6158 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6159 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6160 		 * read-write protection. The PMAP layer though still needs to use the right
6161 		 * index, which is the older XO-now-TPRO one and that is specially selected
6162 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6163 		 */
6164 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6165 			if (__improbable(pmap == kernel_pmap)) {
6166 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6167 				    __func__);
6168 			}
6169 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6170 		} else {
6171 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6172 		}
6173 
6174 		if (pa_valid(pa)) {
6175 			unsigned int pai;
6176 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6177 
6178 			is_internal = FALSE;
6179 			is_altacct = FALSE;
6180 
6181 			pai = pa_index(pa);
6182 
6183 			pvh_lock(pai);
6184 
6185 			/*
6186 			 * Make sure that the current per-cpu PV free list has
6187 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6188 			 * if the transaction succeeds. We're either in the
6189 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6190 			 * Note that we can still be interrupted, but a primary
6191 			 * interrupt handler can never enter the pmap.
6192 			 */
6193 #if !XNU_MONITOR
6194 			assert(get_preemption_level() > 0);
6195 #endif
6196 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6197 			pv_entry_t **pv_h = pai_to_pvh(pai);
6198 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6199 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6200 
6201 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6202 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6203 				int new_allocated_pves = 0;
6204 
6205 				while (new_allocated_pves < 2) {
6206 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6207 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6208 					if (pv_status == PV_ALLOC_FAIL) {
6209 						break;
6210 					} else if (pv_status == PV_ALLOC_RETRY) {
6211 						/*
6212 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6213 						 * it will have dropped the pmap lock while doing so.
6214 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6215 						 * be on a different CPU now.
6216 						 */
6217 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6218 					} else {
6219 						/* If we've gotten this far then a node should've been allocated. */
6220 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6221 
6222 						new_allocated_pves++;
6223 					}
6224 				}
6225 
6226 				for (int i = 0; i < new_allocated_pves; i++) {
6227 					pv_free(new_pve_p[i]);
6228 				}
6229 			}
6230 
6231 			if (pv_status == PV_ALLOC_FAIL) {
6232 				pvh_unlock(pai);
6233 				kr = KERN_RESOURCE_SHORTAGE;
6234 				break;
6235 			} else if (pv_status == PV_ALLOC_RETRY) {
6236 				pvh_unlock(pai);
6237 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6238 				spte = os_atomic_load(pte_p, relaxed);
6239 				continue;
6240 			}
6241 
6242 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6243 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6244 			} else {
6245 				wimg_bits = pmap_cache_attributes(pn);
6246 			}
6247 
6248 			/* We may be retrying this operation after dropping the PVH lock.
6249 			 * Cache attributes for the physical page may have changed while the lock
6250 			 * was dropped, so clear any cache attributes we may have previously set
6251 			 * in the PTE template. */
6252 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6253 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6254 
6255 #if XNU_MONITOR
6256 			/* The regular old kernel is not allowed to remap PPL pages. */
6257 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6258 				panic("%s: page belongs to PPL, "
6259 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6260 				    __FUNCTION__,
6261 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6262 			}
6263 
6264 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6265 				panic("%s: page locked down, "
6266 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6267 				    __FUNCTION__,
6268 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6269 			}
6270 #endif
6271 
6272 
6273 
6274 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6275 			if (!committed) {
6276 				pvh_unlock(pai);
6277 				continue;
6278 			}
6279 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6280 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6281 
6282 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6283 			/*
6284 			 * If there was already a valid pte here then we reuse its reference
6285 			 * on the ptd and drop the one that we took above.
6286 			 */
6287 			drop_refcnt = had_valid_mapping;
6288 
6289 			if (!had_valid_mapping) {
6290 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6291 				int pve_ptep_idx = 0;
6292 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6293 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6294 				if (pv_status != PV_ALLOC_SUCCESS) {
6295 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6296 					    __func__, pv_status, new_pve_p, pmap);
6297 				}
6298 
6299 				if (pmap != kernel_pmap) {
6300 					if (options & PMAP_OPTIONS_INTERNAL) {
6301 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6302 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6303 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6304 							/*
6305 							 * Make a note to ourselves that this
6306 							 * mapping is using alternative
6307 							 * accounting. We'll need this in order
6308 							 * to know which ledger to debit when
6309 							 * the mapping is removed.
6310 							 *
6311 							 * The altacct bit must be set while
6312 							 * the pv head is locked. Defer the
6313 							 * ledger accounting until after we've
6314 							 * dropped the lock.
6315 							 */
6316 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6317 							is_altacct = TRUE;
6318 						}
6319 					}
6320 					if (ppattr_test_reusable(pai) &&
6321 					    !is_altacct) {
6322 						is_reusable = TRUE;
6323 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6324 						is_internal = TRUE;
6325 					} else {
6326 						is_external = TRUE;
6327 					}
6328 				}
6329 			}
6330 
6331 			pvh_unlock(pai);
6332 
6333 			if (pp_attr_bits != 0) {
6334 				ppattr_pa_set_bits(pa, pp_attr_bits);
6335 			}
6336 
6337 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6338 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6339 
6340 				if (is_internal) {
6341 					/*
6342 					 * Make corresponding adjustments to
6343 					 * phys_footprint statistics.
6344 					 */
6345 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6346 					if (is_altacct) {
6347 						/*
6348 						 * If this page is internal and
6349 						 * in an IOKit region, credit
6350 						 * the task's total count of
6351 						 * dirty, internal IOKit pages.
6352 						 * It should *not* count towards
6353 						 * the task's total physical
6354 						 * memory footprint, because
6355 						 * this entire region was
6356 						 * already billed to the task
6357 						 * at the time the mapping was
6358 						 * created.
6359 						 *
6360 						 * Put another way, this is
6361 						 * internal++ and
6362 						 * alternate_accounting++, so
6363 						 * net effect on phys_footprint
6364 						 * is 0. That means: don't
6365 						 * touch phys_footprint here.
6366 						 */
6367 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6368 					} else {
6369 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6370 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6371 							skip_footprint_debit = true;
6372 						} else {
6373 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6374 						}
6375 					}
6376 				}
6377 				if (is_reusable) {
6378 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6379 				} else if (is_external) {
6380 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6381 				}
6382 			}
6383 		} else {
6384 			if (prot & VM_PROT_EXECUTE) {
6385 				kr = KERN_FAILURE;
6386 				break;
6387 			}
6388 
6389 			wimg_bits = pmap_cache_attributes(pn);
6390 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6391 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6392 			}
6393 
6394 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6395 
6396 #if XNU_MONITOR
6397 			pte = pmap_construct_io_pte(pa, pte);
6398 
6399 			/**
6400 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6401 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6402 			 * created and later removed.  We must therefore prevent an attacker from downgrading a
6403 			 * a writable mapping in order to allow it to be removed and remapped to something else.
6404 			 */
6405 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6406 			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6407 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6408 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6409 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6410 				    __func__, (uint64_t)pte_to_pa(spte));
6411 			}
6412 #endif
6413 
6414 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6415 			if (committed) {
6416 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6417 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6418 
6419 				/**
6420 				 * If there was already a valid pte here then we reuse its
6421 				 * reference on the ptd and drop the one that we took above.
6422 				 */
6423 				drop_refcnt = had_valid_mapping;
6424 			}
6425 		}
6426 		if (committed) {
6427 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6428 				assert(pmap != kernel_pmap);
6429 
6430 				/* One less "compressed" */
6431 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6432 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6433 
6434 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6435 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6436 				} else if (!skip_footprint_debit) {
6437 					/* Was part of the footprint */
6438 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6439 				}
6440 				/* The old entry held a reference so drop the extra one that we took above. */
6441 				drop_refcnt = true;
6442 			}
6443 		}
6444 	}
6445 
6446 	if (drop_refcnt && refcnt != NULL) {
6447 		assert(refcnt_updated);
6448 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6449 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6450 		}
6451 	}
6452 
6453 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6454 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6455 	}
6456 
6457 	pmap_unlock(pmap, lock_mode);
6458 
6459 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6460 		pmap_phys_write_disable(v);
6461 	}
6462 
6463 	return kr;
6464 }
6465 
6466 kern_return_t
6467 pmap_enter_options_addr(
6468 	pmap_t pmap,
6469 	vm_map_address_t v,
6470 	pmap_paddr_t pa,
6471 	vm_prot_t prot,
6472 	vm_prot_t fault_type,
6473 	unsigned int flags,
6474 	boolean_t wired,
6475 	unsigned int options,
6476 	__unused void   *arg,
6477 	__unused pmap_mapping_type_t mapping_type)
6478 {
6479 	kern_return_t kr = KERN_FAILURE;
6480 
6481 
6482 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6483 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6484 
6485 
6486 	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
6487 	do {
6488 #if XNU_MONITOR
6489 		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
6490 #else
6491 		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6492 #endif
6493 
6494 		if (kr == KERN_RESOURCE_SHORTAGE) {
6495 #if XNU_MONITOR
6496 			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6497 #endif
6498 			if (nowait_requested) {
6499 				break;
6500 			}
6501 		}
6502 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
6503 
6504 #if XNU_MONITOR
6505 	pmap_ledger_check_balance(pmap);
6506 #endif
6507 
6508 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6509 
6510 	return kr;
6511 }
6512 
6513 kern_return_t
6514 pmap_enter_options(
6515 	pmap_t pmap,
6516 	vm_map_address_t v,
6517 	ppnum_t pn,
6518 	vm_prot_t prot,
6519 	vm_prot_t fault_type,
6520 	unsigned int flags,
6521 	boolean_t wired,
6522 	unsigned int options,
6523 	__unused void   *arg,
6524 	pmap_mapping_type_t mapping_type)
6525 {
6526 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6527 }
6528 
6529 /*
6530  *	Routine:	pmap_change_wiring
6531  *	Function:	Change the wiring attribute for a map/virtual-address
6532  *			pair.
6533  *	In/out conditions:
6534  *			The mapping must already exist in the pmap.
6535  */
6536 MARK_AS_PMAP_TEXT kern_return_t
6537 pmap_change_wiring_internal(
6538 	pmap_t pmap,
6539 	vm_map_address_t v,
6540 	boolean_t wired)
6541 {
6542 	pt_entry_t     *pte_p;
6543 	pmap_paddr_t    pa;
6544 
6545 	validate_pmap_mutable(pmap);
6546 
6547 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6548 		return KERN_ABORTED;
6549 	}
6550 
6551 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6552 
6553 	pte_p = pmap_pte(pmap, v);
6554 	if (pte_p == PT_ENTRY_NULL) {
6555 		if (!wired) {
6556 			/*
6557 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6558 			 * may have been freed by a remove operation.
6559 			 */
6560 			goto pmap_change_wiring_return;
6561 		} else {
6562 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6563 		}
6564 	}
6565 	/*
6566 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6567 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6568 	 */
6569 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6570 
6571 	while (pa_valid(pa)) {
6572 		pmap_paddr_t new_pa;
6573 
6574 		pvh_lock(pa_index(pa));
6575 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6576 
6577 		if (pa == new_pa) {
6578 			break;
6579 		}
6580 
6581 		pvh_unlock(pa_index(pa));
6582 		pa = new_pa;
6583 	}
6584 
6585 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6586 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6587 		if (!wired) {
6588 			/* PTE cleared by prior remove/disconnect operation */
6589 			goto pmap_change_wiring_cleanup;
6590 		} else {
6591 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6592 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6593 		}
6594 	}
6595 
6596 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6597 	if (wired != pte_is_wired(*pte_p)) {
6598 		pte_set_wired(pmap, pte_p, wired);
6599 		if (pmap != kernel_pmap) {
6600 			if (wired) {
6601 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6602 			} else if (!wired) {
6603 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6604 			}
6605 		}
6606 	}
6607 
6608 pmap_change_wiring_cleanup:
6609 	if (pa_valid(pa)) {
6610 		pvh_unlock(pa_index(pa));
6611 	}
6612 
6613 pmap_change_wiring_return:
6614 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6615 
6616 	return KERN_SUCCESS;
6617 }
6618 
6619 void
6620 pmap_change_wiring(
6621 	pmap_t pmap,
6622 	vm_map_address_t v,
6623 	boolean_t wired)
6624 {
6625 	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
6626 	pmap_verify_preemptible();
6627 
6628 	kern_return_t kr = KERN_FAILURE;
6629 #if XNU_MONITOR
6630 	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6631 	do {
6632 		kr = pmap_change_wiring_ppl(pmap, v, wired);
6633 	} while (kr == KERN_ABORTED);
6634 
6635 	pmap_ledger_check_balance(pmap);
6636 #else
6637 	/* Since we verified preemptibility, call the helper only once. */
6638 	kr = pmap_change_wiring_internal(pmap, v, wired);
6639 #endif
6640 
6641 	if (kr != KERN_SUCCESS) {
6642 		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6643 		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6644 	}
6645 }
6646 
6647 MARK_AS_PMAP_TEXT pmap_paddr_t
6648 pmap_find_pa_internal(
6649 	pmap_t pmap,
6650 	addr64_t va)
6651 {
6652 	pmap_paddr_t    pa = 0;
6653 
6654 	validate_pmap(pmap);
6655 
6656 	if (pmap != kernel_pmap) {
6657 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6658 	}
6659 
6660 	pa = pmap_vtophys(pmap, va);
6661 
6662 	if (pmap != kernel_pmap) {
6663 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6664 	}
6665 
6666 	return pa;
6667 }
6668 
6669 pmap_paddr_t
6670 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6671 {
6672 	pmap_paddr_t pa = 0;
6673 
6674 	if (pmap == kernel_pmap) {
6675 		pa = mmu_kvtop(va);
6676 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6677 		/*
6678 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6679 		 * translation even if PAN would prevent kernel access through the translation.
6680 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6681 		 */
6682 		pa = mmu_uvtop(va);
6683 	}
6684 	return pa;
6685 }
6686 
6687 pmap_paddr_t
6688 pmap_find_pa(
6689 	pmap_t pmap,
6690 	addr64_t va)
6691 {
6692 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6693 
6694 	if (pa != 0) {
6695 		return pa;
6696 	}
6697 
6698 	if (not_in_kdp) {
6699 #if XNU_MONITOR
6700 		return pmap_find_pa_ppl(pmap, va);
6701 #else
6702 		return pmap_find_pa_internal(pmap, va);
6703 #endif
6704 	} else {
6705 		return pmap_vtophys(pmap, va);
6706 	}
6707 }
6708 
6709 ppnum_t
6710 pmap_find_phys_nofault(
6711 	pmap_t pmap,
6712 	addr64_t va)
6713 {
6714 	ppnum_t ppn;
6715 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6716 	return ppn;
6717 }
6718 
6719 ppnum_t
6720 pmap_find_phys(
6721 	pmap_t pmap,
6722 	addr64_t va)
6723 {
6724 	ppnum_t ppn;
6725 	ppn = atop(pmap_find_pa(pmap, va));
6726 	return ppn;
6727 }
6728 
6729 /**
6730  * Translate a kernel virtual address into a physical address.
6731  *
6732  * @param va The kernel virtual address to translate. Does not work on user
6733  *           virtual addresses.
6734  *
6735  * @return The physical address if the translation was successful, or zero if
6736  *         no valid mappings were found for the given virtual address.
6737  */
6738 pmap_paddr_t
6739 kvtophys(vm_offset_t va)
6740 {
6741 	/**
6742 	 * Attempt to do the translation first in hardware using the AT (address
6743 	 * translation) instruction. This will attempt to use the MMU to do the
6744 	 * translation for us.
6745 	 */
6746 	pmap_paddr_t pa = mmu_kvtop(va);
6747 
6748 	if (pa) {
6749 		return pa;
6750 	}
6751 
6752 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6753 	return pmap_vtophys(kernel_pmap, va);
6754 }
6755 
6756 /**
6757  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6758  * points to a non-kernel-managed physical page, then this call will panic().
6759  *
6760  * @note The output of this function is guaranteed to be a kernel-managed
6761  *       physical page, which means it's safe to pass the output directly to
6762  *       pa_index() to create a physical address index for various pmap data
6763  *       structures.
6764  *
6765  * @param va The kernel virtual address to translate. Does not work on user
6766  *           virtual addresses.
6767  *
6768  * @return The translated physical address for the given virtual address.
6769  */
6770 pmap_paddr_t
6771 kvtophys_nofail(vm_offset_t va)
6772 {
6773 	pmap_paddr_t pa = kvtophys(va);
6774 
6775 	if (!pa_valid(pa)) {
6776 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6777 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6778 	}
6779 
6780 	return pa;
6781 }
6782 
6783 pmap_paddr_t
6784 pmap_vtophys(
6785 	pmap_t pmap,
6786 	addr64_t va)
6787 {
6788 	if ((va < pmap->min) || (va >= pmap->max)) {
6789 		return 0;
6790 	}
6791 
6792 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6793 
6794 	tt_entry_t * ttp = NULL;
6795 	tt_entry_t * ttep = NULL;
6796 	tt_entry_t   tte = ARM_TTE_EMPTY;
6797 	pmap_paddr_t pa = 0;
6798 	unsigned int cur_level;
6799 
6800 	ttp = pmap->tte;
6801 
6802 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6803 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6804 
6805 		tte = *ttep;
6806 
6807 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6808 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6809 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6810 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6811 
6812 		if ((tte & valid_mask) != valid_mask) {
6813 			return (pmap_paddr_t) 0;
6814 		}
6815 
6816 		/* This detects both leaf entries and intermediate block mappings. */
6817 		if ((tte & type_mask) == type_block) {
6818 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6819 			break;
6820 		}
6821 
6822 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6823 	}
6824 
6825 	return pa;
6826 }
6827 
6828 /*
6829  *	pmap_init_pte_page - Initialize a page table page.
6830  */
6831 MARK_AS_PMAP_TEXT void
6832 pmap_init_pte_page(
6833 	pmap_t pmap,
6834 	pt_entry_t *pte_p,
6835 	vm_offset_t va,
6836 	unsigned int ttlevel,
6837 	boolean_t alloc_ptd)
6838 {
6839 	pt_desc_t   *ptdp = NULL;
6840 	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6841 
6842 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6843 		if (alloc_ptd) {
6844 			/*
6845 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6846 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6847 			 * bootstrap request, so we check for an existing PTD here.
6848 			 */
6849 			ptdp = ptd_alloc(pmap);
6850 			if (ptdp == NULL) {
6851 				panic("%s: unable to allocate PTD", __func__);
6852 			}
6853 			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6854 			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6855 			pvh_set_flags(pvh, 0);
6856 		} else {
6857 			panic("pmap_init_pte_page(): pte_p %p", pte_p);
6858 		}
6859 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6860 		ptdp = pvh_ptd(pvh);
6861 	} else {
6862 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6863 	}
6864 
6865 	// below barrier ensures previous updates to the page are visible to PTW before
6866 	// it is linked to the PTE of previous level
6867 	__builtin_arm_dmb(DMB_ISHST);
6868 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6869 }
6870 
6871 /*
6872  *	Routine:	pmap_expand
6873  *
6874  *	Expands a pmap to be able to map the specified virtual address.
6875  *
6876  *	Allocates new memory for the default (COARSE) translation table
6877  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6878  *	also allocates space for the corresponding pv entries.
6879  *
6880  *	Nothing should be locked.
6881  */
6882 MARK_AS_PMAP_TEXT static kern_return_t
6883 pmap_expand(
6884 	pmap_t pmap,
6885 	vm_map_address_t v,
6886 	unsigned int options,
6887 	unsigned int level)
6888 {
6889 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6890 
6891 	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6892 		return KERN_INVALID_ADDRESS;
6893 	}
6894 	pmap_paddr_t    pa;
6895 	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
6896 	tt_entry_t              *tte_p;
6897 	tt_entry_t              *tt_p;
6898 
6899 	pa = 0x0ULL;
6900 	tt_p =  (tt_entry_t *)NULL;
6901 
6902 	for (; ttlevel < level; ttlevel++) {
6903 		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
6904 			return KERN_ABORTED;
6905 		}
6906 
6907 		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
6908 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
6909 			kern_return_t ret;
6910 			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
6911 				if (options & PMAP_OPTIONS_NOWAIT) {
6912 					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
6913 					return ret;
6914 				}
6915 #if XNU_MONITOR
6916 				panic("%s: failed to allocate tt, "
6917 				    "pmap=%p, v=%p, options=0x%x, level=%u",
6918 				    __FUNCTION__,
6919 				    pmap, (void *)v, options, level);
6920 #else
6921 				VM_PAGE_WAIT();
6922 #endif
6923 			}
6924 
6925 			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6926 				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6927 				return KERN_ABORTED;
6928 			}
6929 
6930 			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
6931 				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
6932 				pa = kvtophys_nofail((vm_offset_t)tt_p);
6933 				tte_p = pmap_ttne(pmap, ttlevel, v);
6934 				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6935 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6936 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
6937 				pa = 0x0ULL;
6938 				tt_p = (tt_entry_t *)NULL;
6939 			}
6940 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6941 		} else {
6942 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
6943 		}
6944 
6945 		if (tt_p != (tt_entry_t *)NULL) {
6946 			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6947 			tt_p = (tt_entry_t *)NULL;
6948 		}
6949 	}
6950 
6951 	return KERN_SUCCESS;
6952 }
6953 
6954 /*
6955  *	Routine:	pmap_gc
6956  *	Function:
6957  *              Pmap garbage collection
6958  *		Called by the pageout daemon when pages are scarce.
6959  *
6960  */
6961 void
6962 pmap_gc(void)
6963 {
6964 	/*
6965 	 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
6966 	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6967 	 * or may contain wired mappings.  However, with the relatively recent change to
6968 	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
6969 	 * page, it may make sense to call that function here.
6970 	 */
6971 }
6972 
6973 /*
6974  *      By default, don't attempt pmap GC more frequently
6975  *      than once / 1 minutes.
6976  */
6977 
6978 void
6979 compute_pmap_gc_throttle(
6980 	void *arg __unused)
6981 {
6982 }
6983 
6984 /*
6985  * pmap_attribute_cache_sync(vm_offset_t pa)
6986  *
6987  * Invalidates all of the instruction cache on a physical page and
6988  * pushes any dirty data from the data cache for the same physical page
6989  */
6990 
6991 kern_return_t
6992 pmap_attribute_cache_sync(
6993 	ppnum_t pp,
6994 	vm_size_t size,
6995 	__unused vm_machine_attribute_t attribute,
6996 	__unused vm_machine_attribute_val_t * value)
6997 {
6998 	if (size > PAGE_SIZE) {
6999 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7000 	} else {
7001 		cache_sync_page(pp);
7002 	}
7003 
7004 	return KERN_SUCCESS;
7005 }
7006 
7007 /*
7008  * pmap_sync_page_data_phys(ppnum_t pp)
7009  *
7010  * Invalidates all of the instruction cache on a physical page and
7011  * pushes any dirty data from the data cache for the same physical page
7012  */
7013 void
7014 pmap_sync_page_data_phys(
7015 	ppnum_t pp)
7016 {
7017 	cache_sync_page(pp);
7018 }
7019 
7020 /*
7021  * pmap_sync_page_attributes_phys(ppnum_t pp)
7022  *
7023  * Write back and invalidate all cachelines on a physical page.
7024  */
7025 void
7026 pmap_sync_page_attributes_phys(
7027 	ppnum_t pp)
7028 {
7029 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7030 }
7031 
7032 #if CONFIG_COREDUMP
7033 /* temporary workaround */
7034 boolean_t
7035 coredumpok(
7036 	vm_map_t map,
7037 	mach_vm_offset_t va)
7038 {
7039 	pt_entry_t     *pte_p;
7040 	pt_entry_t      spte;
7041 
7042 	pte_p = pmap_pte(map->pmap, va);
7043 	if (0 == pte_p) {
7044 		return FALSE;
7045 	}
7046 	if (vm_map_entry_has_device_pager(map, va)) {
7047 		return FALSE;
7048 	}
7049 	spte = *pte_p;
7050 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7051 }
7052 #endif
7053 
7054 void
7055 fillPage(
7056 	ppnum_t pn,
7057 	unsigned int fill)
7058 {
7059 	unsigned int   *addr;
7060 	int             count;
7061 
7062 	addr = (unsigned int *) phystokv(ptoa(pn));
7063 	count = PAGE_SIZE / sizeof(unsigned int);
7064 	while (count--) {
7065 		*addr++ = fill;
7066 	}
7067 }
7068 
extern void     mapping_set_mod(ppnum_t pn);

/* Mark physical page 'pn' as modified (dirty) in the pmap-layer page attributes. */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7077 
extern void     mapping_set_ref(ppnum_t pn);

/* Mark physical page 'pn' as referenced in the pmap-layer page attributes. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7086 
7087 /*
7088  * Clear specified attribute bits.
7089  *
7090  * Try to force an arm_fast_fault() for all mappings of
7091  * the page - to force attributes to be set again at fault time.
7092  * If the forcing succeeds, clear the cached bits at the head.
7093  * Otherwise, something must have been wired, so leave the cached
7094  * attributes alone.
7095  */
7096 MARK_AS_PMAP_TEXT static void
7097 phys_attribute_clear_with_flush_range(
7098 	ppnum_t         pn,
7099 	unsigned int    bits,
7100 	int             options,
7101 	void            *arg,
7102 	pmap_tlb_flush_range_t *flush_range)
7103 {
7104 	pmap_paddr_t    pa = ptoa(pn);
7105 	vm_prot_t       allow_mode = VM_PROT_ALL;
7106 
7107 #if XNU_MONITOR
7108 	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7109 		panic("%s: illegal request, "
7110 		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7111 		    __FUNCTION__,
7112 		    pn, bits, options, arg, flush_range);
7113 	}
7114 #endif
7115 	if ((arg != NULL) || (flush_range != NULL)) {
7116 		options = options & ~PMAP_OPTIONS_NOFLUSH;
7117 	}
7118 
7119 	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
7120 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7121 		    "invalid options",
7122 		    pn, bits, options, arg, flush_range);
7123 	}
7124 
7125 	if (__improbable((bits & PP_ATTR_MODIFIED) &&
7126 	    (options & PMAP_OPTIONS_NOFLUSH))) {
7127 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7128 		    "should not clear 'modified' without flushing TLBs",
7129 		    pn, bits, options, arg, flush_range);
7130 	}
7131 
7132 	assert(pn != vm_page_fictitious_addr);
7133 
7134 	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7135 		assert(bits == PP_ATTR_MODIFIED);
7136 
7137 		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7138 		/*
7139 		 * We short circuit this case; it should not need to
7140 		 * invoke arm_force_fast_fault, so just clear the modified bit.
7141 		 * pmap_page_protect has taken care of resetting
7142 		 * the state so that we'll see the next write as a fault to
7143 		 * the VM (i.e. we don't want a fast fault).
7144 		 */
7145 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7146 		return;
7147 	}
7148 	if (bits & PP_ATTR_REFERENCED) {
7149 		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7150 	}
7151 	if (bits & PP_ATTR_MODIFIED) {
7152 		allow_mode &= ~VM_PROT_WRITE;
7153 	}
7154 
7155 	if (bits == PP_ATTR_NOENCRYPT) {
7156 		/*
7157 		 * We short circuit this case; it should not need to
7158 		 * invoke arm_force_fast_fault, so just clear and
7159 		 * return.  On ARM, this bit is just a debugging aid.
7160 		 */
7161 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7162 		return;
7163 	}
7164 
7165 	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7166 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7167 	}
7168 }
7169 
7170 MARK_AS_PMAP_TEXT void
7171 phys_attribute_clear_internal(
7172 	ppnum_t         pn,
7173 	unsigned int    bits,
7174 	int             options,
7175 	void            *arg)
7176 {
7177 	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7178 }
7179 
7180 #if __ARM_RANGE_TLBI__
/*
 * Clear the given attribute bits for all managed pages mapped in [start, end),
 * where the range lies within a single twig (next-to-last level) table entry.
 *
 * Returns the address at which processing stopped: 'end' on completion, or an
 * earlier address if preemption became pending mid-walk (the caller restarts
 * from the returned address).  Requires the pmap lock held shared.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No table at this level: nothing mapped here, so the whole twig is done. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/*
			 * Check for pending preemption after at least one page has
			 * been processed, so forward progress is always made.
			 */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only managed (pmap-tracked) pages carry attribute bits. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7224 
/*
 * Clear attribute bits for all managed pages mapped in [start, end) of the
 * given pmap, coalescing TLB invalidations into a single ranged flush.
 *
 * Returns the address at which processing stopped; the caller loops until
 * the whole range has been covered (the walk yields for pending preemption
 * and for lock-acquisition failure).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates whether any PTE change requires a deferred TLB flush. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* If the lock can't be taken without blocking, yield and let the caller retry. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance one twig-table boundary at a time, clamped to 'end'. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* A short return or pending preemption means we must drop the lock and yield. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the single coalesced ranged TLB flush for everything changed above. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7277 
/*
 * Front end for ranged attribute clearing: dispatches to the PPL (when the
 * monitor is present) or directly to the internal implementation, looping
 * until the entire [start, end) range has been processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The callee may return early (preemption/lock yield); resume where it stopped. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7308 #endif /* __ARM_RANGE_TLBI__ */
7309 
/*
 * Clear the given cached attribute bits for a single physical page,
 * dispatching through the PPL when the monitor is present.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7331 
7332 /*
7333  *	Set specified attribute bits.
7334  *
7335  *	Set cached value in the pv head because we have
7336  *	no per-mapping hardware support for referenced and
7337  *	modify bits.
7338  */
7339 MARK_AS_PMAP_TEXT void
7340 phys_attribute_set_internal(
7341 	ppnum_t pn,
7342 	unsigned int bits)
7343 {
7344 	pmap_paddr_t    pa = ptoa(pn);
7345 	assert(pn != vm_page_fictitious_addr);
7346 
7347 #if XNU_MONITOR
7348 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7349 		panic("%s: illegal request, "
7350 		    "pn=%u, bits=%#x",
7351 		    __FUNCTION__,
7352 		    pn, bits);
7353 	}
7354 #endif
7355 
7356 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7357 
7358 	return;
7359 }
7360 
/*
 * Set cached attribute bits for a physical page, dispatching through the
 * PPL when the monitor is present.
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7372 
7373 
7374 /*
7375  *	Check specified attribute bits.
7376  *
7377  *	use the software cached bits (since no hw support).
7378  */
7379 static boolean_t
7380 phys_attribute_test(
7381 	ppnum_t pn,
7382 	unsigned int bits)
7383 {
7384 	pmap_paddr_t    pa = ptoa(pn);
7385 	assert(pn != vm_page_fictitious_addr);
7386 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7387 }
7388 
7389 
7390 /*
7391  *	Set the modify/reference bits on the specified physical page.
7392  */
7393 void
7394 pmap_set_modify(ppnum_t pn)
7395 {
7396 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
7397 }
7398 
7399 
7400 /*
7401  *	Clear the modify bits on the specified physical page.
7402  */
7403 void
7404 pmap_clear_modify(
7405 	ppnum_t pn)
7406 {
7407 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7408 }
7409 
7410 
7411 /*
7412  *	pmap_is_modified:
7413  *
7414  *	Return whether or not the specified physical page is modified
7415  *	by any physical maps.
7416  */
7417 boolean_t
7418 pmap_is_modified(
7419 	ppnum_t pn)
7420 {
7421 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7422 }
7423 
7424 
7425 /*
7426  *	Set the reference bit on the specified physical page.
7427  */
7428 static void
7429 pmap_set_reference(
7430 	ppnum_t pn)
7431 {
7432 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
7433 }
7434 
7435 /*
7436  *	Clear the reference bits on the specified physical page.
7437  */
7438 void
7439 pmap_clear_reference(
7440 	ppnum_t pn)
7441 {
7442 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7443 }
7444 
7445 
7446 /*
7447  *	pmap_is_referenced:
7448  *
7449  *	Return whether or not the specified physical page is referenced
7450  *	by any physical maps.
7451  */
7452 boolean_t
7453 pmap_is_referenced(
7454 	ppnum_t pn)
7455 {
7456 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7457 }
7458 
7459 /*
7460  * pmap_get_refmod(phys)
7461  *  returns the referenced and modified bits of the specified
7462  *  physical page.
7463  */
7464 unsigned int
7465 pmap_get_refmod(
7466 	ppnum_t pn)
7467 {
7468 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7469 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7470 }
7471 
7472 static inline unsigned int
7473 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7474 {
7475 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7476 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7477 }
7478 
7479 /*
7480  * pmap_clear_refmod(phys, mask)
7481  *  clears the referenced and modified bits as specified by the mask
7482  *  of the specified physical page.
7483  */
7484 void
7485 pmap_clear_refmod_options(
7486 	ppnum_t         pn,
7487 	unsigned int    mask,
7488 	unsigned int    options,
7489 	void            *arg)
7490 {
7491 	unsigned int    bits;
7492 
7493 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7494 	phys_attribute_clear(pn, bits, options, arg);
7495 }
7496 
7497 /*
7498  * Perform pmap_clear_refmod_options on a virtual address range.
7499  * The operation will be performed in bulk & tlb flushes will be coalesced
7500  * if possible.
7501  *
7502  * Returns true if the operation is supported on this platform.
7503  * If this function returns false, the operation is not supported and
7504  * nothing has been modified in the pmap.
7505  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	/* Translate the VM_MEM_* mask and clear in bulk over the whole range. */
	unsigned int    bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is a large performance improvement
	 * on platforms that support ranged tlbi instructions. But on older
	 * platforms, we can only flush per-page or the entire asid. So we
	 * currently only support this operation on platforms that support ranged
	 * tlbi instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7533 
7534 void
7535 pmap_clear_refmod(
7536 	ppnum_t pn,
7537 	unsigned int mask)
7538 {
7539 	pmap_clear_refmod_options(pn, mask, 0, NULL);
7540 }
7541 
7542 unsigned int
7543 pmap_disconnect_options(
7544 	ppnum_t pn,
7545 	unsigned int options,
7546 	void *arg)
7547 {
7548 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7549 		/*
7550 		 * On ARM, the "modified" bit is managed by software, so
7551 		 * we know up-front if the physical page is "modified",
7552 		 * without having to scan all the PTEs pointing to it.
7553 		 * The caller should have made the VM page "busy" so noone
7554 		 * should be able to establish any new mapping and "modify"
7555 		 * the page behind us.
7556 		 */
7557 		if (pmap_is_modified(pn)) {
7558 			/*
7559 			 * The page has been modified and will be sent to
7560 			 * the VM compressor.
7561 			 */
7562 			options |= PMAP_OPTIONS_COMPRESSOR;
7563 		} else {
7564 			/*
7565 			 * The page hasn't been modified and will be freed
7566 			 * instead of compressed.
7567 			 */
7568 		}
7569 	}
7570 
7571 	/* disconnect the page */
7572 	pmap_page_protect_options(pn, 0, options, arg);
7573 
7574 	/* return ref/chg status */
7575 	return pmap_get_refmod(pn);
7576 }
7577 
7578 /*
7579  *	Routine:
7580  *		pmap_disconnect
7581  *
7582  *	Function:
7583  *		Disconnect all mappings for this page and return reference and change status
7584  *		in generic format.
7585  *
7586  */
7587 unsigned int
7588 pmap_disconnect(
7589 	ppnum_t pn)
7590 {
7591 	pmap_page_protect(pn, 0);       /* disconnect the page */
7592 	return pmap_get_refmod(pn);   /* return ref/chg status */
7593 }
7594 
7595 boolean_t
7596 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7597 {
7598 	if (ptoa(first) >= vm_last_phys) {
7599 		return FALSE;
7600 	}
7601 	if (ptoa(last) < vm_first_phys) {
7602 		return FALSE;
7603 	}
7604 
7605 	return TRUE;
7606 }
7607 
7608 /*
7609  * The state maintained by the noencrypt functions is used as a
7610  * debugging aid on ARM.  This incurs some overhead on the part
7611  * of the caller.  A special case check in phys_attribute_clear
7612  * (the most expensive path) currently minimizes this overhead,
7613  * but stubbing these functions out on RELEASE kernels yields
7614  * further wins.
7615  */
7616 boolean_t
7617 pmap_is_noencrypt(
7618 	ppnum_t pn)
7619 {
7620 #if DEVELOPMENT || DEBUG
7621 	boolean_t result = FALSE;
7622 
7623 	if (!pa_valid(ptoa(pn))) {
7624 		return FALSE;
7625 	}
7626 
7627 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7628 
7629 	return result;
7630 #else
7631 #pragma unused(pn)
7632 	return FALSE;
7633 #endif
7634 }
7635 
7636 void
7637 pmap_set_noencrypt(
7638 	ppnum_t pn)
7639 {
7640 #if DEVELOPMENT || DEBUG
7641 	if (!pa_valid(ptoa(pn))) {
7642 		return;
7643 	}
7644 
7645 	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7646 #else
7647 #pragma unused(pn)
7648 #endif
7649 }
7650 
7651 void
7652 pmap_clear_noencrypt(
7653 	ppnum_t pn)
7654 {
7655 #if DEVELOPMENT || DEBUG
7656 	if (!pa_valid(ptoa(pn))) {
7657 		return;
7658 	}
7659 
7660 	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7661 #else
7662 #pragma unused(pn)
7663 #endif
7664 }
7665 
7666 #if XNU_MONITOR
/* Report whether the (managed) physical page carries the PPL monitor attribute. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7673 #endif
7674 
/*
 * Lock the per-page lock for a physical page.  On non-PPL kernels a valid
 * managed page uses its PV head lock; otherwise (unmanaged pages, or all
 * pages on XNU_MONITOR builds) a global backup lock is taken instead.
 *
 * NOTE(review): the braced statement at the bottom is the body of the
 * dangling "else" on non-monitor builds, and the whole statement on
 * monitor builds — take care when editing the #if structure.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7691 
7692 
/*
 * Unlock the per-page lock taken by pmap_lock_phys_page(): the PV head lock
 * for a valid managed page on non-PPL kernels, the global backup lock
 * otherwise.  The #if/else structure mirrors pmap_lock_phys_page().
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7709 
/*
 * Switch the user translation-table base to the given pmap on this CPU,
 * updating the per-CPU cached nested-pmap/shared-region state first.
 * Passing kernel_pmap clears the user TTB instead (if not already clear).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache the nested (shared-region) pmap info for this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the incoming pmap's page-size config differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the new table base with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7738 
/* Point the user translation-table base at the invalid (empty) table. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7744 
/*
 * Clear the user translation-table base on this CPU, dispatching through
 * the PPL when the monitor is present.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7756 
7757 
7758 #if defined(__arm64__)
7759 /*
7760  * Marker for use in multi-pass fast-fault PV list processing.
7761  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7762  * these functions, as compressed PTEs should never be present in PV lists.
7763  * Note that this only holds true for arm64; for arm32 we don't have enough
7764  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7765  * and WRITEABLE marker depending on whether the PTE is valid.
7766  */
7767 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7768 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7769 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7770 #endif
7771 
7772 
/*
 * Downgrade every mapping of a physical page so that the next access of a
 * disallowed type (per allow_mode) takes a fault, allowing ref/mod state to
 * be gathered again: clearing VM_PROT_READ clears the AF bit, and clearing
 * VM_PROT_WRITE demotes RW mappings to RO.
 *
 * Returns FALSE for unmanaged pages or when a wired mapping was skipped
 * (unless PMAP_OPTIONS_FF_WIRED); TRUE otherwise.  Unless
 * PMAP_OPTIONS_FF_LOCKED is set, the PV head lock is taken internally.
 * If flush_range is non-NULL, TLB invalidation for mappings covered by the
 * range is deferred to the caller via ptfr_flush_needed.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	/* pass1/pass2 counters cross-check that both passes saw the same PTEs. */
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	/* NOTE(review): never set true in this configuration; kept for clarity below. */
	bool             ref_aliases_mod = false;
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless the caller explicitly opts in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/* A change beyond the SW "writeable" bit means a stale TLB entry may exist. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Only mappings outside the caller's flush range need a TLBI here. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		/* Make all pass-1 PTE stores visible before any TLB invalidation. */
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked during pass 1 need invalidation; clear the marker. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that fast-fault state is pending so the fault handler can restore access. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8086 
8087 MARK_AS_PMAP_TEXT boolean_t
8088 arm_force_fast_fault_internal(
8089 	ppnum_t         ppnum,
8090 	vm_prot_t       allow_mode,
8091 	int             options)
8092 {
8093 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8094 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8095 	}
8096 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8097 }
8098 
8099 /*
8100  *	Routine:	arm_force_fast_fault
8101  *
8102  *	Function:
8103  *		Force all mappings for this page to fault according
8104  *		to the access modes allowed, so we can gather ref/modify
8105  *		bits again.
8106  */
8107 
boolean_t
arm_force_fast_fault(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	__unused void   *arg)
{
	pmap_paddr_t    phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Filter out unmanaged pages before crossing into the PPL / internal path. */
	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

#if XNU_MONITOR
	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
#else
	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
#endif
}
8129 
8130 /*
8131  *	Routine:	arm_clear_fast_fault
8132  *
8133  *	Function:
8134  *		Clear pending force fault for all mappings for this page based on
8135  *		the observed fault type, update ref/modify bits.
8136  */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	/* Number of PV entries visited; used to cap how much of the list we walk. */
	unsigned int    npve = 0;
	/* Counts of PTEs flagged in pass 1 vs. fixed up in pass 2; must agree. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/*
	 * If the caller didn't hand us a specific PTE, operate on every mapping
	 * of the page: either the single PTE stored directly in the PV head, or
	 * the full PV entry list.
	 */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting position so pass 2 can re-walk the same mappings. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Unused slot within this PV entry. */
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; leave them alone. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				/*
				 * Restore write permission that was previously downgraded
				 * for modify-bit tracking, and record the modification.
				 */
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Set the Access Flag and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/*
			 * Only HW-visible changes (anything beyond the SW "was writeable"
			 * bit) require TLB invalidation; tag those PTEs so pass 2 can
			 * find them.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the walk so we don't hold the PVH lock for too long. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Ensure all pass-1 PTE stores are visible before issuing invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation; clear the tag here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* The two passes must have touched exactly the same set of mappings. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		/* Wait for the async invalidations issued in pass 2 to complete. */
		sync_tlb_flush();
	}
	return result;
}
8338 
8339 /*
8340  * Determine if the fault was induced by software tracking of
8341  * modify/reference bits.  If so, re-enable the mapping (and set
8342  * the appropriate bits).
8343  *
8344  * Returns KERN_SUCCESS if the fault was induced and was
8345  * successfully handled.
8346  *
8347  * Returns KERN_FAILURE if the fault was not induced and
8348  * the function was unable to deal with it.
8349  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8351  * disallows this type of access.
8352  *
8353  * Returns KERN_ABORTED if the pmap lock is taken and a
8354  * preemption is pending.
8355  *
8356  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Bail out (to be redriven by the caller) rather than delay a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we observe a stable PTE value while holding the PVH
		 * lock for the page it maps, since a concurrent operation may
		 * change the mapping between our read and the lock acquisition.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				/* Not a managed page; there is no PVH lock or ref/mod state for it. */
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; drop the lock and retry with the new value. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		/* Fix up all mappings of the page (PT_ENTRY_NULL => walk the PV list). */
		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8496 
8497 kern_return_t
8498 arm_fast_fault(
8499 	pmap_t pmap,
8500 	vm_map_address_t va,
8501 	vm_prot_t fault_type,
8502 	bool was_af_fault,
8503 	__unused bool from_user)
8504 {
8505 	kern_return_t   result = KERN_FAILURE;
8506 
8507 	if (va < pmap->min || va >= pmap->max) {
8508 		return result;
8509 	}
8510 
8511 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8512 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8513 	    from_user);
8514 
8515 	do {
8516 #if XNU_MONITOR
8517 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8518 #else
8519 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8520 #endif
8521 	} while (result == KERN_ABORTED);
8522 
8523 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8524 
8525 	return result;
8526 }
8527 
8528 void
8529 pmap_copy_page(
8530 	ppnum_t psrc,
8531 	ppnum_t pdst)
8532 {
8533 	bcopy_phys((addr64_t) (ptoa(psrc)),
8534 	    (addr64_t) (ptoa(pdst)),
8535 	    PAGE_SIZE);
8536 }
8537 
8538 
8539 /*
8540  *	pmap_copy_page copies the specified (machine independent) pages.
8541  */
8542 void
8543 pmap_copy_part_page(
8544 	ppnum_t psrc,
8545 	vm_offset_t src_offset,
8546 	ppnum_t pdst,
8547 	vm_offset_t dst_offset,
8548 	vm_size_t len)
8549 {
8550 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8551 	    (addr64_t) (ptoa(pdst) + dst_offset),
8552 	    len);
8553 }
8554 
8555 
8556 /*
8557  *	pmap_zero_page zeros the specified (machine independent) page.
8558  */
8559 void
8560 pmap_zero_page(
8561 	ppnum_t pn)
8562 {
8563 	assert(pn != vm_page_fictitious_addr);
8564 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8565 }
8566 
8567 /*
8568  *	pmap_zero_part_page
8569  *	zeros the specified (machine independent) part of a page.
8570  */
8571 void
8572 pmap_zero_part_page(
8573 	ppnum_t pn,
8574 	vm_offset_t offset,
8575 	vm_size_t len)
8576 {
8577 	assert(pn != vm_page_fictitious_addr);
8578 	assert(offset + len <= PAGE_SIZE);
8579 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8580 }
8581 
/*
 * Map the lowGlo globals page at its fixed kernel alias address
 * (LOWGLOBAL_ALIAS) as a read-only, non-executable, writeback-cached page.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	/* The alias slot must not already be mapped. */
	assert(*ptep == ARM_PTE_EMPTY);

	/* Read-only (kernel), never executable, access flag pre-set. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Make the PTE store visible before invalidating any stale TLB entries. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Let KASAN know this alias address is now backed and readable. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8606 
8607 vm_offset_t
8608 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8609 {
8610 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8611 		panic("%s: invalid index %u", __func__, index);
8612 	}
8613 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8614 }
8615 
/*
 * Map physical page 'pn' into a free per-CPU copy window on the current CPU
 * with the requested protection and cacheability, returning the window index.
 * Panics if no window is free.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* On PPL systems, copy windows may only target non-managed (I/O) pages. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (faulting) window slot for this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel mapping: valid, access flag set, never executable. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	/* Apply the caller-requested cacheability/shareability attributes. */
	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the previous mapping's strong-sync requirement for this flush, then record the new one. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8690 
/*
 * External entry point for mapping a per-CPU copy window; dispatches into
 * the PPL on monitor-enabled systems.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8703 
/*
 * Tear down the current CPU's copy-window mapping at slot 'index' and
 * invalidate any TLB entries for it.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Mark the slot free; strong write ordering before the TLB invalidation below. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8724 
/*
 * External entry point for unmapping a per-CPU copy window; dispatches into
 * the PPL on monitor-enabled systems.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8735 
8736 #if XNU_MONITOR
8737 
/*
 * Intentionally a no-op in this configuration; all arguments are unused.
 */
MARK_AS_PMAP_TEXT void
pmap_invoke_with_page(
	ppnum_t page_number,
	void *ctx,
	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
{
	#pragma unused(page_number, ctx, callback)
}
8746 
8747 /*
8748  * Loop over every pmap_io_range (I/O ranges marked as owned by
8749  * the PPL in the device tree) and conditionally call callback() on each range
8750  * that needs to be included in the hibernation image.
8751  *
8752  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8753  *                 context is needed in the callback.
8754  * @param callback Callback function invoked on each range (gated by flag).
8755  */
8756 MARK_AS_PMAP_TEXT void
8757 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8758 {
8759 	extern const pmap_io_range_t* io_attr_table;
8760 	extern const unsigned int num_io_rgns;
8761 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8762 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8763 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8764 		}
8765 	}
8766 }
8767 
8768 /**
8769  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8770  * PPL-owned page. Otherwise, do nothing.
8771  *
8772  * @param addr Physical address of the page to set the HASHED flag on.
8773  */
MARK_AS_PMAP_TEXT void
pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
{
	/* Ignore non-managed kernel memory. */
	if (!pa_valid(addr)) {
		return;
	}

	const unsigned int pai = pa_index(addr);
	/* Only PPL-owned pages carry the HASHED tracking flag. */
	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
		pv_entry_t **pv_h = pai_to_pvh(pai);

		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
		pvh_lock(pai);
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
		pvh_unlock(pai);
	}
}
8792 
8793 /**
8794  * Loop through every physical page in the system and clear out the HASHED flag
8795  * on every PPL-owned page. That flag is used to keep track of which pages have
8796  * been hashed into the hibernation image during the hibernation entry process.
8797  *
8798  * The HASHED flag needs to be cleared out between hibernation cycles because the
8799  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8800  * image with the HASHED flag set on certain pages. It's important to clear the
8801  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8802  * into the hibernation image can't be compromised across hibernation cycles.
8803  */
8804 MARK_AS_PMAP_TEXT void
8805 pmap_clear_ppl_hashed_flag_all(void)
8806 {
8807 	const unsigned int last_index = pa_index(vm_last_phys);
8808 	pv_entry_t **pv_h = NULL;
8809 
8810 	for (int pai = 0; pai < last_index; ++pai) {
8811 		pv_h = pai_to_pvh(pai);
8812 
8813 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8814 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8815 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8816 			pvh_lock(pai);
8817 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8818 			pvh_unlock(pai);
8819 		}
8820 	}
8821 }
8822 
8823 /**
8824  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8825  * ppl_hib driver will call this after all wired pages have been copied into the
8826  * hibernation image.
8827  */
8828 MARK_AS_PMAP_TEXT void
8829 pmap_check_ppl_hashed_flag_all(void)
8830 {
8831 	const unsigned int last_index = pa_index(vm_last_phys);
8832 	pv_entry_t **pv_h = NULL;
8833 
8834 	for (int pai = 0; pai < last_index; ++pai) {
8835 		pv_h = pai_to_pvh(pai);
8836 
8837 		/**
8838 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8839 		 * the pages that contain the PMAP stacks.
8840 		 */
8841 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8842 		    (pai < pa_index(pmap_stacks_end_pa));
8843 
8844 		if (!is_pmap_stack &&
8845 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8846 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8847 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8848 		}
8849 	}
8850 }
8851 
8852 #endif /* XNU_MONITOR */
8853 
8854 /*
8855  * Indicate that a pmap is intended to be used as a nested pmap
8856  * within one or more larger address spaces.  This must be set
8857  * before pmap_nest() is called with this pmap as the 'subordinate'.
8858  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/*
	 * Atomically transition USER -> NESTED so a concurrent caller cannot
	 * race us into nesting the same pmap twice (or nesting a pmap of an
	 * unsupported type).
	 */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Nested pmaps don't keep their own ASID; release it. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8879 
/*
 * External entry point for marking a pmap as nestable; dispatches into the
 * PPL on monitor-enabled systems.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8890 
8891 bool
8892 pmap_is_nested(
8893 	pmap_t pmap)
8894 {
8895 	return pmap->type == PMAP_TYPE_NESTED;
8896 }
8897 
8898 /*
8899  * pmap_trim_range(pmap, start, end)
8900  *
8901  * pmap  = pmap to operate on
8902  * start = start of the range
8903  * end   = end of the range
8904  *
8905  * Attempts to deallocate TTEs for the given range in the nested range.
8906  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The requested trim range must lie entirely within the nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to trim at this twig; the helpers above would otherwise drop the lock. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the whole L2 table; it may only be freed if fully empty. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9022 
9023 /**
9024  * State machine for multi-step pmap trimming. Trimming is the action of
9025  * deallocating the TTEs of the shared region of pmaps down to a given range.
9026  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9027  * disabling preemption for too long. These steps include computing the bounds
9028  * of the shared region, trimming the head of the "grand", trimming the tail of
9029  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9030  * different conditions.
9031  *
9032  * @param grand the pmap in which the pages are nested
9033  * @param subord the pmap from which the pages are shared, or nested
9034  * @param vstart start of the used range in "grand"
9035  * @param size size of the used range
9036  * @param state the current state of the state machine
9037  *
9038  * @return the next state of the state machine, to be used in the next call
9039  *         into this function.
9040  */
9041 MARK_AS_PMAP_TEXT pmap_trim_state_t
9042 pmap_trim_internal(
9043 	pmap_t grand,
9044 	pmap_t subord,
9045 	addr64_t vstart,
9046 	uint64_t size,
9047 	pmap_trim_state_t state)
9048 {
9049 	/* Validation needs to be done regardless of state. */
9050 	addr64_t vend;
9051 
9052 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9053 		panic("%s: grand addr wraps around, "
9054 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9055 		    __func__, grand, subord, (void*)vstart, size, state);
9056 	}
9057 
9058 	validate_pmap_mutable(grand);
9059 	validate_pmap(subord);
9060 
9061 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9062 		panic("%s: subord is of non-nestable type 0x%hhx, "
9063 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9064 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9065 	}
9066 
9067 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9068 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9069 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9070 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9071 	}
9072 
9073 	if (__improbable(grand->nested_pmap != subord)) {
9074 		panic("%s: grand->nested != subord, "
9075 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9076 		    __func__, grand, subord, (void*)vstart, size, state);
9077 	}
9078 
9079 	if (__improbable((size != 0) &&
9080 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9081 		panic("%s: grand range not in nested region, "
9082 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9083 		    __func__, grand, subord, (void*)vstart, size, state);
9084 	}
9085 
9086 	/* Trimming starts with figuring out the bounds for the grand. */
9087 	if (state == PMAP_TRIM_STATE_START) {
9088 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9089 
9090 		/**
9091 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9092 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9093 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9094 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9095 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9096 		 * PMAP_TRIM_STATE_DONE.
9097 		 */
9098 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9099 			assert(subord->nested_bounds_set);
9100 
9101 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9102 			if (!grand->nested_bounds_set) {
9103 				/* Inherit the bounds from subord. */
9104 				grand->nested_region_true_start = subord->nested_region_true_start;
9105 				grand->nested_region_true_end = subord->nested_region_true_end;
9106 				grand->nested_bounds_set = true;
9107 			}
9108 
9109 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9110 
9111 			/* Now that the grand has bounds, we are done. */
9112 			return PMAP_TRIM_STATE_DONE;
9113 		}
9114 
9115 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9116 		if ((!subord->nested_bounds_set) && size) {
9117 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9118 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9119 
9120 			subord->nested_region_true_start = vstart;
9121 			subord->nested_region_true_end = vend;
9122 			subord->nested_region_true_start &= ~adjust_offmask;
9123 
9124 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9125 				panic("%s: padded true end wraps around, "
9126 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9127 				    __func__, grand, subord, (void*)vstart, size, state);
9128 			}
9129 
9130 			subord->nested_region_true_end &= ~adjust_offmask;
9131 			subord->nested_bounds_set = true;
9132 		}
9133 
9134 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9135 		if (subord->nested_bounds_set) {
9136 			/* Inherit the bounds from subord. */
9137 			grand->nested_region_true_start = subord->nested_region_true_start;
9138 			grand->nested_region_true_end = subord->nested_region_true_end;
9139 			grand->nested_bounds_set = true;
9140 
9141 			/* If we know the bounds, we can trim the pmap. */
9142 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9143 
9144 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9145 		} else {
9146 			/* Don't trim if we don't know the bounds. */
9147 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9148 
9149 			return PMAP_TRIM_STATE_DONE;
9150 		}
9151 	}
9152 
9153 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9154 	if (!grand->nested_bounds_set) {
9155 		panic("%s: !grand->nested_bounds_set, "
9156 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9157 		    __func__, grand, subord, (void*)vstart, size, state);
9158 	}
9159 
9160 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9161 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9162 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9163 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9164 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9165 			    (unsigned int)grand->nested_no_bounds_ref_state);
9166 		}
9167 
9168 #if XNU_MONITOR
9169 		if (pmap_pending_preemption()) {
9170 			return PMAP_TRIM_STATE_GRAND_AFTER;
9171 		}
9172 #endif
9173 
9174 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9175 	}
9176 
9177 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9178 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9179 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9180 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9181 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9182 			    (unsigned int)grand->nested_no_bounds_ref_state);
9183 		}
9184 
9185 #if XNU_MONITOR
9186 		if (pmap_pending_preemption()) {
9187 			return PMAP_TRIM_STATE_SUBORD;
9188 		}
9189 #endif
9190 
9191 		state = PMAP_TRIM_STATE_SUBORD;
9192 	}
9193 
9194 	/* START state is guaranteed to compute the bounds for the subord. */
9195 	if (!subord->nested_bounds_set) {
9196 		panic("%s: !subord->nested_bounds_set, "
9197 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9198 		    __func__, grand, subord, (void*)vstart, size, state);
9199 	}
9200 
9201 	if (state == PMAP_TRIM_STATE_SUBORD) {
9202 		/**
9203 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9204 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9205 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9206 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9207 		 * the state update is visible only once the preceding trim operation is complete.  An
9208 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9209 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9210 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9211 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9212 		 * of the state CAS.
9213 		 */
9214 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9215 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9216 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9217 			    (unsigned int)grand->nested_no_bounds_ref_state);
9218 		}
9219 		pmap_trim_subord(subord);
9220 	}
9221 
9222 	return PMAP_TRIM_STATE_DONE;
9223 }
9224 
9225 MARK_AS_PMAP_TEXT static void
9226 pmap_trim_self(pmap_t pmap)
9227 {
9228 	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9229 		/* If we have a no bounds ref, we need to drop it. */
9230 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9231 		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9232 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9233 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9234 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9235 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9236 
9237 		if (nested_bounds_set) {
9238 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9239 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9240 		}
9241 		/*
9242 		 * Try trimming the nested pmap, in case we had the
9243 		 * last reference.
9244 		 */
9245 		pmap_trim_subord(pmap->nested_pmap);
9246 	}
9247 }
9248 
9249 /*
9250  * pmap_trim_subord(grand, subord)
9251  *
9252  * grand  = pmap that we have nested subord in
9253  * subord = nested pmap we are attempting to trim
9254  *
9255  * Trims subord if possible
9256  */
9257 MARK_AS_PMAP_TEXT static void
9258 pmap_trim_subord(pmap_t subord)
9259 {
9260 	bool contract_subord = false;
9261 
9262 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9263 
9264 	subord->nested_no_bounds_refcnt--;
9265 
9266 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9267 		/* If this was the last no bounds reference, trim subord. */
9268 		contract_subord = true;
9269 	}
9270 
9271 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9272 
9273 	if (contract_subord) {
9274 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9275 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9276 	}
9277 }
9278 
9279 /**
9280  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9281  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9282  * disabling preemption for too long.
9283  *
 * @note When we load the shared region we always create page tables for the
9285  *       entire region. In practice, the shared cache may use just a portion
9286  *       of that. Before we know the bounds of the shared region, it can
9287  *       already be mapped into processes. Therefore, once the bounds are
9288  *       known, "trimming" comes in handy to remove the unnecessary page
9289  *       tables in the processes the shared region is mapped in, and eventually
9290  *       those in the shared region itself. Note that the shared region must
9291  *       be trimmed after the user processes because it has the L3 entries
9292  *       everyone else is pointing to.
9293  *
9294  * @param grand the pmap in which the pages are nested
9295  * @param subord the pmap from which the pages are shared, or nested
9296  * @param vstart start of the used range in "grand"
9297  * @param size size of the used range
9298  */
9299 void
9300 pmap_trim(
9301 	pmap_t grand,
9302 	pmap_t subord,
9303 	addr64_t vstart,
9304 	uint64_t size)
9305 {
9306 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9307 
9308 #if XNU_MONITOR
9309 	/* On PPL systems, drives the state machine until its done. */
9310 	while (state != PMAP_TRIM_STATE_DONE) {
9311 		__assert_only pmap_trim_state_t old_state = state;
9312 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9313 
9314 		/* Are we making progress? */
9315 		assert(old_state != state);
9316 	}
9317 
9318 	pmap_ledger_check_balance(grand);
9319 	pmap_ledger_check_balance(subord);
9320 #else
9321 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9322 
9323 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9324 	assert(state == PMAP_TRIM_STATE_DONE);
9325 #endif
9326 }
9327 
9328 #if HAS_APPLE_PAC
9329 void *
9330 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9331 {
9332 	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9333 		panic("attempt to sign user pointer without process independent key");
9334 	}
9335 
9336 	void *res = NULL;
9337 	uint64_t current_intr_state = pmap_interrupts_disable();
9338 
9339 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9340 
9341 	__compiler_materialize_and_prevent_reordering_on(value);
9342 	switch (key) {
9343 	case ptrauth_key_asia:
9344 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9345 		break;
9346 	case ptrauth_key_asda:
9347 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9348 		break;
9349 	default:
9350 		__builtin_unreachable();
9351 	}
9352 	__compiler_materialize_and_prevent_reordering_on(res);
9353 
9354 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9355 
9356 	pmap_interrupts_restore(current_intr_state);
9357 
9358 	return res;
9359 }
9360 
/* Sign a user-space pointer with a process-independent key under the given user JOP key. */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9366 
9367 void *
9368 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9369 {
9370 	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9371 		panic("attempt to auth user pointer without process independent key");
9372 	}
9373 
9374 	void *res = NULL;
9375 	uint64_t current_intr_state = pmap_interrupts_disable();
9376 
9377 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9378 	__compiler_materialize_and_prevent_reordering_on(value);
9379 	res = ml_auth_ptr_unchecked(value, key, discriminator);
9380 	__compiler_materialize_and_prevent_reordering_on(res);
9381 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9382 
9383 	pmap_interrupts_restore(current_intr_state);
9384 
9385 	return res;
9386 }
9387 
/* Authenticate a user-space pointer signed with a process-independent key under the given user JOP key. */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9393 #endif /* HAS_APPLE_PAC */
9394 
9395 /*
9396  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9397  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9398  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9399  * return value, to indicate where a preempted [un]nest operation should resume.
9400  * When the return value contains the ending address of the nested region with
9401  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9402  */
9403 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9404 
9405 /*
9406  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9407  *
9408  *	grand  = the pmap that we will nest subord into
9409  *	subord = the pmap that goes into the grand
9410  *	vstart  = start of range in pmap to be inserted
9411  *	size   = Size of nest area (up to 16TB)
9412  *
9413  *	Inserts a pmap into another.  This is used to implement shared segments.
9414  *
9415  */
9416 
9417 /**
9418  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9419  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9420  * This function operates in 3 main phases:
9421  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9422  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9423  *    the mapping range are present in subord.
9424  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9425  *    contains pointers to subord's leaf-level pagetable pages for the specified
9426  *    VA range.
9427  *
9428  * This function may return early due to pending AST_URGENT preemption; if so
9429  * it will indicate the need to be re-entered.
9430  *
9431  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9432  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9433  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9434  * @param size twig-aligned size of the nesting range
9435  * @param vrestart the twig-aligned starting address of the current call.  May contain
9436  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9437  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9438  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9439  *
9440  * @return the virtual address at which to restart the operation, possibly including
9441  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9442  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9443  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	uint64_t        nested_region_unnested_table_bitmap_size;
	unsigned int*   nested_region_unnested_table_bitmap = NULL;
	uint64_t        new_nested_region_unnested_table_bitmap_size;
	unsigned int*   new_nested_region_unnested_table_bitmap = NULL;
	int             expand_options = 0;
	bool            deref_subord = true;
	bool            grand_locked = false;

	/* Sanity-check the requested range: it must not wrap, and vrestart must lie within it. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord; released at nest_cleanup unless grand keeps it. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* The whole request (including the restart cursor) must be twig-table aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1a: if this is the subord's first nesting, allocate its unnested-table
	 * bitmap (one bit per twig table) and publish it together with the region fields.
	 */
	if (subord->nested_region_unnested_table_bitmap == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		/* The PPL backs the bitmap with a single page; larger requests cannot be represented. */
		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may have installed a bitmap meanwhile. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the race: free the bitmap we allocated but did not install. */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * Phase 1b: the requested range extends past the subord's current nested
	 * region; grow the region and replace the bitmap with a larger one.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t        new_size;

		nested_region_unnested_table_bitmap = NULL;
		nested_region_unnested_table_bitmap_size = 0ULL;
		new_size =  vend - subord->nested_region_addr;

		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_unnested_table_bitmap = kalloc_data(
			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock; copy the old bitmap into the new one and swap them. */
		if (subord->nested_region_size < new_size) {
			bcopy(subord->nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size);
			nested_region_unnested_table_bitmap_size  = subord->nested_region_unnested_table_bitmap_size;
			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free the displaced old bitmap (or, if we lost the race, the unused new one). */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
		if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			new_nested_region_unnested_table_bitmap = NULL;
		}
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	/* Phase 1c: link grand to subord (first nest wins the CAS; later nests validate). */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
		/**
		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
		 * into a nested pmap, which would then produce multiple levels of nesting.
		 */
		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
		}
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		if (__improbable(vstart < subord->nested_region_addr ||
		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* grand was already nested: the existing linkage must match and may only grow forward. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor and end to the subord's true bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* If a previous call already finished expanding subord, skip straight to phase 3. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 2: expand subord so leaf-level page tables exist for the whole range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* pmap_expand() may block/allocate, so the lock must be dropped around it. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/* Periodically yield to urgent preemption; the caller will re-enter at vrestart. */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 3: copy subord's twig TTEs into grand for the (clamped) range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (__improbable(stte_p == PT_ENTRY_NULL)) {
			panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
		}
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			/* Expand grand down to twig level so there is a TTE slot to fill. */
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		*gtte_p = *stte_p;

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reaching true_end completes the operation; report the full vend to the caller. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible to the table walker before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	/* Release any bitmap allocation that was never installed into subord. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(new_nested_region_unnested_table_bitmap,
		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	/* Drop our subord reference unless grand took ownership of it above. */
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9855 
/*
 * Nest the address range [vstart, vstart + size) of pmap 'subord' into pmap
 * 'grand' by repeatedly invoking the (preemptible) nesting helper until the
 * entire range has been processed.  Completion is signaled by the helper
 * returning (vend | PMAP_NEST_GRAND).
 *
 * @param grand  pmap that will contain the nested (shared) region.
 * @param subord pmap whose mappings are nested into 'grand'.
 * @param vstart starting virtual address of the range to nest.
 * @param size   size of the range to nest.
 *
 * @return KERN_SUCCESS, or the first unexpected error reported by the helper.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	/* Previous restart cursor; used to detect lack of forward progress (PPL only). */
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/*
	 * The PPL helper may bail out early for page shortages, lock contention
	 * (KERN_ABORTED), or pending preemption; keep re-entering it with the
	 * returned restart cursor until the whole range is nested.
	 */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* PPL ran out of pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* Successful iteration that didn't advance the cursor: stuck. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9914 
9915 /*
9916  *	kern_return_t pmap_unnest(grand, vaddr)
9917  *
9918  *	grand  = the pmap that will have the virtual range unnested
9919  *	vaddr  = start of range in pmap to be unnested
9920  *	size   = size of range in pmap to be unnested
9921  *
9922  */
9923 
9924 kern_return_t
9925 pmap_unnest(
9926 	pmap_t grand,
9927 	addr64_t vaddr,
9928 	uint64_t size)
9929 {
9930 	return pmap_unnest_options(grand, vaddr, size, 0);
9931 }
9932 
9933 /**
9934  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9935  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9936  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9937  * still have the region nested.  The mappings in 'grand' will be left empty
9938  * with the assumption that they will be demand-filled by subsequent access faults.
9939  *
9940  * This function operates in 2 main phases:
9941  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9942  *    them non-global.
9943  * 2. Clearing of the twig-level TTEs for the address range in grand.
9944  *
9945  * This function may return early due to pending AST_URGENT preemption; if so
9946  * it will indicate the need to be re-entered.
9947  *
9948  * @param grand pmap from which to unnest mappings
9949  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9950  * @param size twig-aligned size of the nested range
9951  * @param vrestart the page-aligned starting address of the current call.  May contain
9952  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9953  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9954  *        grand is being torn down and step 1) above is not needed.
9955  *
9956  * @return the virtual address at which to restart the operation, possibly including
9957  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9958  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9959  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t     *tte_p;
	unsigned int    current_index;
	unsigned int    start_index;
	unsigned int    max_index;
	unsigned int    entry_count = 0; /* pages/twigs processed; drives periodic preemption checks */

	addr64_t vend;
	addr64_t true_end;
	/* Reject a range whose end wraps the 64-bit address space. */
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The restart cursor (sans phase bit) must lie within [vaddr, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* Both the base address and size must be twig-aligned. */
	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the working end to the portion of the region that was actually nested. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/*
	 * Phase 1: walk the nested pmap's leaf mappings and mark them non-global.
	 * Skipped entirely when the caller passed PMAP_UNNEST_CLEAN (grand is
	 * being torn down) or when vrestart says we already completed this phase.
	 */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
		}

		/* Preemptible lock acquire: if it would block, ask the caller to re-enter. */
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		/* Twig-table indices spanning [start, true_end) relative to the nested region base. */
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t  *bpte, *cpte;

			/* End of the current twig region (next twig boundary above addr). */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
			 * address.
			 */
			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    (addr & pt_attr_twig_offmask(pt_attr))) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t    pa;
					unsigned int    pai = 0;
					boolean_t               managed = FALSE;
					pt_entry_t  spte;

					if ((*cpte != ARM_PTE_TYPE_FAULT)
					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						/*
						 * For a managed page, take the PVH lock and re-read the PTE
						 * until it describes a stable physical page; this guards
						 * against a concurrent update racing our NG-bit write.
						 */
						spte = *((volatile pt_entry_t*)cpte);
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						/* Set the non-global bit if it isn't already set. */
						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					/* Bail out mid-twig if preemption is pending; vrestart records where to resume. */
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		/* Publish the PTE updates and invalidate any stale global TLB entries. */
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		/* Phase 1 incomplete (preempted): hand the cursor back to the caller. */
		if (current_index < max_index) {
			return vrestart;
		}
	}

	/*
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		/*
		 * Resuming phase 2 from a prior call; recover the twig-aligned cursor.
		 * NOTE(review): the '!= 0x0ULL' below compares the result of
		 * __improbable() rather than the masked address; behavior appears
		 * unchanged since __improbable() preserves truthiness, but the
		 * parenthesization looks unintentional — confirm against other call sites.
		 */
		addr = vrestart & ~PMAP_NEST_GRAND;
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		/* Phase 1 just finished; start phase 2 from the beginning of the range. */
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straightly into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	/* Phase 2: clear grand's twig TTEs over the (clamped) range. */
	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached the clamped end: report full completion as (vend | PMAP_NEST_GRAND). */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10170 
/*
 * Remove a nested region from pmap 'grand' by repeatedly invoking the
 * (preemptible) unnest helper until it reports completion, which is
 * signaled by a restart cursor of (vend | PMAP_NEST_GRAND).
 *
 * @param grand  pmap from which to unnest the range.
 * @param vaddr  twig-aligned start of the range.
 * @param size   twig-aligned size of the range.
 * @param option PMAP_UNNEST_* control flags passed through to the helper.
 *
 * @return KERN_SUCCESS (the helper panics on invalid input).
 */
kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	unsigned int option)
{
	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
	vm_map_offset_t vend = vaddr + size;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	pmap_verify_preemptible();
	/* Re-enter the helper with its returned cursor until the range is done. */
	while (vrestart != (vend | PMAP_NEST_GRAND)) {
#if XNU_MONITOR
		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
#else
		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
10197 
/*
 * The ARM pmap performs no adjustment of unnest parameters; always return
 * TRUE so the VM layer proceeds (and can reach log_unnest_badness()).
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10206 
10207 #if PMAP_FORK_NEST
10208 /**
10209  * Perform any necessary pre-nesting of the parent's shared region at fork()
10210  * time.
10211  *
10212  * @note This should only be called from vm_map_fork().
10213  *
10214  * @param old_pmap The pmap of the parent task.
10215  * @param new_pmap The pmap of the child task.
10216  * @param nesting_start An output parameter that is updated with the start
10217  *                      address of the range that was pre-nested
10218  * @param nesting_end An output parameter that is updated with the end
10219  *                      address of the range that was pre-nested
10220  *
10221  * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10222  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10223  */
10224 kern_return_t
10225 pmap_fork_nest(
10226 	pmap_t old_pmap,
10227 	pmap_t new_pmap,
10228 	vm_map_offset_t *nesting_start,
10229 	vm_map_offset_t *nesting_end)
10230 {
10231 	if (old_pmap == NULL || new_pmap == NULL) {
10232 		return KERN_INVALID_ARGUMENT;
10233 	}
10234 	if (old_pmap->nested_pmap == NULL) {
10235 		return KERN_SUCCESS;
10236 	}
10237 	pmap_nest(new_pmap,
10238 	    old_pmap->nested_pmap,
10239 	    old_pmap->nested_region_addr,
10240 	    old_pmap->nested_region_size);
10241 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
10242 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
10243 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
10244 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
10245 	    new_pmap->nested_pmap,
10246 	    new_pmap->nested_region_addr,
10247 	    new_pmap->nested_region_size,
10248 	    old_pmap->nested_pmap,
10249 	    old_pmap->nested_region_addr,
10250 	    old_pmap->nested_region_size);
10251 	*nesting_start = old_pmap->nested_region_addr;
10252 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
10253 	return KERN_SUCCESS;
10254 }
10255 #endif /* PMAP_FORK_NEST */
10256 
10257 /*
10258  * disable no-execute capability on
10259  * the specified pmap
10260  */
#if DEVELOPMENT || DEBUG
/* Development/debug builds: actually clear the pmap's NX enforcement flag. */
void
pmap_disable_NX(
	pmap_t pmap)
{
	pmap->nx_enabled = FALSE;
}
#else
/* Release builds: NX cannot be disabled; this is a deliberate no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10275 
10276 /*
10277  * flush a range of hardware TLB entries.
10278  * NOTE: assumes the smallest TLB entry in use will be for
10279  * an ARM small page (4K).
10280  */
10281 
10282 #if __ARM_RANGE_TLBI__
10283 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10284 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10285 #else
10286 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10287 #endif // __ARM_RANGE_TLBI__
10288 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10289     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10290     "of npages to 32 bits below may truncate.");
10291 
/*
 * Issue (but do not synchronize) TLB invalidations for the VA range
 * [va, va + length) in 'pmap', choosing the cheapest maintenance strategy
 * based on the number of pages:
 *   - above ARM64_FULL_TLB_FLUSH_THRESHOLD: flush the whole ASID (or all
 *     TLBs for the kernel ASID / nested pmaps);
 *   - above ARM64_RANGE_TLB_FLUSH_THRESHOLD (when range TLBI is supported):
 *     a single range invalidation;
 *   - otherwise: per-entry invalidations.
 * Callers are responsible for the subsequent sync (e.g. sync_tlb_flush()).
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * ASID 0 (kernel) and nested pmaps (whose entries may live under
		 * multiple ASIDs) require a full flush rather than a per-ASID one.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: encode ASID into the TLBI operands and flush entry by entry. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10344 
/* Asynchronously flush every TLB entry belonging to this pmap's hardware ASID. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10350 
/*
 * Synchronously flush kernel-pmap TLB entries for [va, va + length):
 * issue the async invalidations, then wait for them to complete.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10359 
10360 unsigned int
10361 pmap_cache_attributes(
10362 	ppnum_t pn)
10363 {
10364 	pmap_paddr_t    paddr;
10365 	unsigned int    pai;
10366 	unsigned int    result;
10367 	pp_attr_t       pp_attr_current;
10368 
10369 	paddr = ptoa(pn);
10370 
10371 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10372 
10373 	if (!pa_valid(paddr)) {
10374 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10375 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10376 	}
10377 
10378 	result = VM_WIMG_DEFAULT;
10379 
10380 	pai = pa_index(paddr);
10381 
10382 	pp_attr_current = pp_attr_table[pai];
10383 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10384 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10385 	}
10386 	return result;
10387 }
10388 
/*
 * Perform any cache maintenance required when page 'pn' transitions from
 * WIMG attribute 'wimg_bits_prev' to 'wimg_bits_new'.
 *
 * NOTE(review): in the WTHRU arm below,
 * '(wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK)'
 * is always true (no single value equals both constants), so that arm
 * reduces to 'wimg_bits_prev == VM_WIMG_WTHRU'.  Presumably '&&' was
 * intended; as written the code merely performs extra (harmless) syncs —
 * confirm the intended semantics before changing it.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/* Sync caches when leaving a cacheable attribute for an incompatible one. */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Pages newly becoming real-time (RT) must be force-cleaned from the dcache. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10405 
/*
 * Switch the cache attributes of managed page 'pn' from 'prev_cacheattr' to
 * 'new_cacheattr' under the PVH lock, then perform any cache maintenance the
 * transition requires.  Panics on non-managed (and, under PPL, PPL-owned) pages.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must never have their attributes changed from here. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance (sync / forced clean) is done outside the PVH lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10430 
/*
 * Return a kernel-virtual mapping of page 'pn' for compressor use.  On
 * targets with a physical-aperture PTE, the page is first switched to
 * VM_WIMG_DEFAULT if it currently has non-default cache attributes, so the
 * physical-aperture access is cacheable.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	/* The mapping itself is just the physical aperture address of the page. */
	return (void*)phystokv(ptoa(pn));
}
10446 
/*
 * Undo pmap_map_compressor_page(): restore the page's original cache
 * attributes if pmap_map_compressor_page() had switched them to
 * VM_WIMG_DEFAULT.  No unmapping is needed — the physical aperture
 * mapping is permanent.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10461 
10462 /**
10463  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10464  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10465  *
10466  * @param user_page_list List of pages to be updated.
10467  * @param page_cnt Number of pages in total in user_page_list.
10468  * @param cacheattr The new cache attribute.
10469  *
10470  * @return Success if true is returned.
10471  */
10472 bool
10473 pmap_batch_set_cache_attributes(
10474 	upl_page_info_array_t user_page_list,
10475 	unsigned int page_cnt,
10476 	unsigned int cacheattr)
10477 {
10478 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10479 
10480 	if (page_cnt == 0) {
10481 		return true;
10482 	}
10483 
10484 	batch_set_cache_attr_state_t states;
10485 	states.page_index = 0;
10486 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10487 	states.tlb_flush_pass_needed = false;
10488 	states.rt_cache_flush_pass_needed = false;
10489 
10490 	/* Verify we are being called from a preemptible context. */
10491 	pmap_verify_preemptible();
10492 
10493 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10494 #if XNU_MONITOR
10495 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10496 #else /* !XNU_MONITOR */
10497 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10498 #endif /* XNU_MONITOR */
10499 	}
10500 
10501 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10502 	return true;
10503 }
10504 
10505 /**
10506  * Flushes TLB entries associated with the page specified by paddr, but do not
10507  * issue barriers yet.
10508  *
10509  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10510  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page, flushing its VA in the owning pmap. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot within the PV entry; try the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU TLB entries to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PV entry once all of this entry's slots are done. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10564 
10565 /**
10566  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10567  *
10568  * @param pai The Physical Address Index of the entry.
10569  * @param cacheattr The new cache attribute.
10570  */
10571 MARK_AS_PMAP_TEXT static void
10572 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10573 {
10574 	pvh_assert_locked(pai);
10575 
10576 	pp_attr_t pp_attr_current, pp_attr_template;
10577 	do {
10578 		pp_attr_current = pp_attr_table[pai];
10579 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10580 
10581 		/**
10582 		 * WIMG bits should only be updated under the PVH lock, but we should do
10583 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10584 		 */
10585 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10586 }
10587 
10588 /**
10589  * Batch updates the cache attributes of a list of pages in three passes.
10590  *
10591  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10592  * In pass two, TLB entries are flushed for each page in the list if necessary.
10593  * In pass three, caches are cleaned for each page in the list if necessary.
10594  *
10595  * When running in PPL, this function may decide to return to the caller in response
10596  * to AST_URGENT.
10597  *
10598  * @param user_page_list List of pages to be updated.
10599  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10600  * @param page_cnt Number of pages in total in user_page_list.
10601  * @param cacheattr The new cache attributes.
10602  *
10603  * @return The new state of the state machine.
10604  */
10605 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10606 pmap_batch_set_cache_attributes_internal(
10607 #if XNU_MONITOR
10608 	volatile upl_page_info_t *user_page_list,
10609 #else /* !XNU_MONITOR */
10610 	upl_page_info_array_t user_page_list,
10611 #endif /* XNU_MONITOR */
10612 	batch_set_cache_attr_state_t states,
10613 	unsigned int page_cnt,
10614 	unsigned int cacheattr)
10615 {
10616 	uint64_t page_index = states.page_index;
10617 	uint64_t state = states.state;
10618 	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10619 	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10620 
10621 	/* For verifying progress. */
10622 	__assert_only const uint64_t page_index_old = page_index;
10623 	__assert_only const uint64_t state_old = state;
10624 
10625 	/* Assert page_index and state are within their range. */
10626 	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10627 		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10628 	}
10629 
10630 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10631 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10632 		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
10633 		while (page_index < page_cnt) {
10634 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10635 			const pmap_paddr_t paddr = ptoa(pn);
10636 
10637 			if (!pa_valid(paddr)) {
10638 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10639 			}
10640 
10641 			const unsigned int pai = pa_index(paddr);
10642 
10643 			/* Lock the page. */
10644 			pvh_lock(pai);
10645 
10646 #if XNU_MONITOR
10647 			if (ppattr_pa_test_monitor(paddr)) {
10648 				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10649 			}
10650 #endif /* XNU_MONITOR */
10651 			const pp_attr_t pp_attr_current = pp_attr_table[pai];
10652 
10653 			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10654 			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10655 				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10656 			}
10657 
10658 			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10659 
10660 			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10661 			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10662 				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10663 			}
10664 
10665 			/* Update the cache attributes in PTE and PP_ATTR table. */
10666 			if (wimg_bits_new != wimg_bits_prev) {
10667 				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10668 				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10669 			}
10670 
10671 			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10672 				rt_cache_flush_pass_needed = true;
10673 			}
10674 
10675 			pvh_unlock(pai);
10676 
10677 			page_index++;
10678 
10679 #if XNU_MONITOR
10680 			/**
10681 			 * Check for AST_URGENT every page, as the pve list search in cache
10682 			 * update can take non-constant time.
10683 			 */
10684 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10685 				goto pbscai_exit;
10686 			}
10687 #endif /* XNU_MONITOR */
10688 		}
10689 
10690 		/* page_index == page_cnt && !pmap_pending_preemption() */
10691 		if (tlb_flush_pass_needed) {
10692 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10693 		} else if (rt_cache_flush_pass_needed) {
10694 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10695 		} else {
10696 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10697 		}
10698 		page_index = 0;
10699 
10700 		/* Sync the PTE writes before potential TLB/Cache flushes. */
10701 		FLUSH_PTE_STRONG();
10702 
10703 #if XNU_MONITOR
10704 		if (__improbable(pmap_pending_preemption())) {
10705 			goto pbscai_exit;
10706 		}
10707 #endif /* XNU_MONITOR */
10708 	}
10709 
10710 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10711 		/**
10712 		 * Pass 2: for each physical page and for each mapping, we need to flush
10713 		 * the TLB for it.
10714 		 */
10715 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10716 		while (page_index < page_cnt) {
10717 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10718 
10719 			const pmap_paddr_t paddr = ptoa(pn);
10720 			if (!pa_valid(paddr)) {
10721 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10722 			}
10723 
10724 			const unsigned int pai = pa_index(paddr);
10725 
10726 			pvh_lock(pai);
10727 			pmap_flush_tlb_for_paddr_locked_async(paddr);
10728 			pvh_unlock(pai);
10729 
10730 			page_index++;
10731 
10732 #if XNU_MONITOR
10733 			/**
10734 			 * Check for AST_URGENT every page, as the pve list search in cache
10735 			 * update can take non-constant time.
10736 			 */
10737 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10738 				goto pbscai_exit;
10739 			}
10740 #endif /* XNU_MONITOR */
10741 		}
10742 
10743 #if HAS_FEAT_XS
10744 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10745 		arm64_sync_tlb(false);
10746 #else
10747 		/**
10748 		 * For targets that distinguish between mild and strong DSB, mild DSB
10749 		 * will not drain the prefetcher.  This can lead to prefetch-driven
10750 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
10751 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10752 		 */
10753 		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10754 #endif
10755 
10756 		if (rt_cache_flush_pass_needed) {
10757 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10758 		} else {
10759 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10760 		}
10761 		page_index = 0;
10762 
10763 #if XNU_MONITOR
10764 		if (__improbable(pmap_pending_preemption())) {
10765 			goto pbscai_exit;
10766 		}
10767 #endif /* XNU_MONITOR */
10768 	}
10769 
10770 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
10771 		/* Pass 3: Flush the cache if the page is recently set to RT */
10772 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
10773 #if !XNU_MONITOR
10774 		/**
10775 		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
10776 		 * in the state where DC by VA instructions remain enabled.
10777 		 */
10778 		disable_preemption();
10779 #endif /* !XNU_MONITOR */
10780 
10781 		assert(get_preemption_level() > 0);
10782 
10783 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10784 		/**
10785 		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10786 		 * and the host will handle cache maintenance for it. So we don't need to
10787 		 * worry about enabling the ops here for AVP.
10788 		 */
10789 		enable_dc_mva_ops();
10790 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10791 
10792 		while (page_index < page_cnt) {
10793 			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
10794 
10795 			if (!pa_valid(paddr)) {
10796 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10797 			}
10798 
10799 			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
10800 
10801 			page_index++;
10802 
10803 #if XNU_MONITOR
10804 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10805 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10806 				disable_dc_mva_ops();
10807 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10808 				goto pbscai_exit;
10809 			}
10810 #endif /* XNU_MONITOR */
10811 		}
10812 
10813 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10814 		disable_dc_mva_ops();
10815 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10816 
10817 #if !XNU_MONITOR
10818 		enable_preemption();
10819 #endif /* !XNU_MONITOR */
10820 
10821 		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10822 		page_index = 0;
10823 	}
10824 
10825 #if XNU_MONITOR
10826 pbscai_exit:
10827 #endif /* XNU_MONITOR */
10828 	/* Assert page_index and state are within their range. */
10829 	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
10830 
10831 	/* Make sure we are making progress in this call. */
10832 	assert(page_index > page_index_old || state > state_old);
10833 
10834 	batch_set_cache_attr_state_t states_new;
10835 	states_new.page_index = page_index;
10836 	states_new.state = state;
10837 	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
10838 	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
10839 	return states_new;
10840 }
10841 
/**
 * Update the WIMG cache attributes tracked for a single managed page and
 * propagate the change to all existing mappings of that page.
 *
 * @param pn the physical page number whose attributes are being changed.
 * @param cacheattr the new VM_WIMG_* cache attribute for the page.
 * @param external on PPL-enabled (XNU_MONITOR) systems, whether the request
 *        originated outside the PPL; used to reject updates on pages owned
 *        by the other domain.  Unused otherwise.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* A PPL-owned page may only be updated from inside the PPL, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An all-zero WIMG field means the page has the default attributes. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the page's mappings if the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance implied by the old/new attribute pair. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10904 
10905 MARK_AS_PMAP_TEXT void
10906 pmap_set_cache_attributes_internal(
10907 	ppnum_t pn,
10908 	unsigned int cacheattr)
10909 {
10910 	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10911 }
10912 
10913 void
10914 pmap_set_cache_attributes(
10915 	ppnum_t pn,
10916 	unsigned int cacheattr)
10917 {
10918 #if XNU_MONITOR
10919 	pmap_set_cache_attributes_ppl(pn, cacheattr);
10920 #else
10921 	pmap_set_cache_attributes_internal(pn, cacheattr);
10922 #endif
10923 }
10924 
10925 /**
10926  * Updates the page numbered ppnum to have attribute specified by attributes.
10927  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10928  * The necessity of the TLB flush is returned in case this function is called
10929  * in a batched manner and the TLB flush is intended to be done at a different
10930  * timing.
10931  *
10932  * @param ppnum Page Number of the page to be updated.
10933  * @param attributes The new cache attributes.
10934  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10935  *        immediately.
10936  *
10937  * @return Returns true if a TLB flush is needed for this update regardless of
10938  *         whether a flush has occurred already.
10939  */
10940 MARK_AS_PMAP_TEXT bool
10941 pmap_update_cache_attributes_locked(
10942 	ppnum_t ppnum,
10943 	unsigned attributes,
10944 	bool perform_tlbi)
10945 {
10946 	pmap_paddr_t    phys = ptoa(ppnum);
10947 	pv_entry_t      *pve_p;
10948 	pt_entry_t      *pte_p;
10949 	pv_entry_t      **pv_h;
10950 	pt_entry_t      tmplate;
10951 	unsigned int    pai;
10952 	boolean_t       tlb_flush_needed = false;
10953 
10954 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10955 
10956 	if (pmap_panic_dev_wimg_on_managed) {
10957 		switch (attributes & VM_WIMG_MASK) {
10958 		case VM_WIMG_IO:                        // nGnRnE
10959 		case VM_WIMG_POSTED:                    // nGnRE
10960 		/* supported on DRAM, but slow, so we disallow */
10961 
10962 		case VM_WIMG_POSTED_REORDERED:          // nGRE
10963 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10964 			/* unsupported on DRAM */
10965 
10966 			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10967 			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10968 			break;
10969 
10970 		default:
10971 			/* not device type memory, all good */
10972 
10973 			break;
10974 		}
10975 	}
10976 
10977 #if __ARM_PTE_PHYSMAP__
10978 	vm_offset_t kva = phystokv(phys);
10979 	pte_p = pmap_pte(kernel_pmap, kva);
10980 
10981 	tmplate = *pte_p;
10982 	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10983 #if XNU_MONITOR
10984 	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10985 #else
10986 	tmplate |= wimg_to_pte(attributes, phys);
10987 #endif
10988 	if (tmplate & ARM_PTE_HINT_MASK) {
10989 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10990 		    __FUNCTION__, pte_p, (void *)kva, tmplate);
10991 	}
10992 
10993 	if (perform_tlbi) {
10994 		write_pte_strong(pte_p, tmplate);
10995 		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10996 	} else {
10997 		write_pte_fast(pte_p, tmplate);
10998 	}
10999 	tlb_flush_needed = true;
11000 #endif
11001 
11002 	pai = pa_index(phys);
11003 
11004 	pv_h = pai_to_pvh(pai);
11005 
11006 	pte_p = PT_ENTRY_NULL;
11007 	pve_p = PV_ENTRY_NULL;
11008 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11009 		pte_p = pvh_ptep(pv_h);
11010 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11011 		pve_p = pvh_pve_list(pv_h);
11012 		pte_p = PT_ENTRY_NULL;
11013 	}
11014 
11015 	int pve_ptep_idx = 0;
11016 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11017 		vm_map_address_t va;
11018 		pmap_t          pmap;
11019 
11020 		if (pve_p != PV_ENTRY_NULL) {
11021 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11022 			if (pte_p == PT_ENTRY_NULL) {
11023 				goto cache_skip_pve;
11024 			}
11025 		}
11026 
11027 #ifdef PVH_FLAG_IOMMU
11028 		if (pvh_ptep_is_iommu(pte_p)) {
11029 			goto cache_skip_pve;
11030 		}
11031 #endif
11032 		pmap = ptep_get_pmap(pte_p);
11033 #if HAS_FEAT_XS
11034 		/**
11035 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11036 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11037 		 */
11038 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11039 #endif /* HAS_FEAT_XS */
11040 		va = ptep_get_va(pte_p);
11041 
11042 		tmplate = *pte_p;
11043 		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11044 		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11045 
11046 		if (perform_tlbi) {
11047 			write_pte_strong(pte_p, tmplate);
11048 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11049 			    pmap, true, false);
11050 		} else {
11051 			write_pte_fast(pte_p, tmplate);
11052 		}
11053 		tlb_flush_needed = true;
11054 
11055 cache_skip_pve:
11056 		pte_p = PT_ENTRY_NULL;
11057 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11058 			pve_ptep_idx = 0;
11059 			pve_p = pve_next(pve_p);
11060 		}
11061 	}
11062 	if (perform_tlbi && tlb_flush_needed) {
11063 #if HAS_FEAT_XS
11064 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11065 		arm64_sync_tlb(false);
11066 #else
11067 		/**
11068 		 * For targets that distinguish between mild and strong DSB, mild DSB
11069 		 * will not drain the prefetcher.  This can lead to prefetch-driven
11070 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
11071 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11072 		 */
11073 		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11074 #endif
11075 	}
11076 
11077 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11078 
11079 	return tlb_flush_needed;
11080 }
11081 
11082 /**
11083  * Mark a pmap as being dedicated to use for a commpage mapping.
11084  * The pmap itself will never be activated on a CPU; its mappings will
11085  * only be embedded in userspace pmaps at a fixed virtual address.
11086  *
11087  * @param pmap the pmap to mark as belonging to a commpage.
11088  */
11089 static void
11090 pmap_set_commpage(pmap_t pmap)
11091 {
11092 #if XNU_MONITOR
11093 	assert(!pmap_ppl_locked_down);
11094 #endif
11095 	assert(pmap->type == PMAP_TYPE_USER);
11096 	pmap->type = PMAP_TYPE_COMMPAGE;
11097 	/*
11098 	 * Free the pmap's ASID.  This pmap should not ever be directly
11099 	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
11100 	 * ASID space contention but will also cause pmap_switch() to panic
11101 	 * if an attacker tries to activate this pmap.  Disable preemption to
11102 	 * accommodate the *_nopreempt spinlock in free_asid().
11103 	 */
11104 	mp_disable_preemption();
11105 	pmap_get_pt_ops(pmap)->free_id(pmap);
11106 	mp_enable_preemption();
11107 }
11108 
11109 static void
11110 pmap_update_tt3e(
11111 	pmap_t pmap,
11112 	vm_address_t address,
11113 	tt_entry_t template)
11114 {
11115 	tt_entry_t *ptep, pte;
11116 
11117 	ptep = pmap_tt3e(pmap, address);
11118 	if (ptep == NULL) {
11119 		panic("%s: no ptep?", __FUNCTION__);
11120 	}
11121 
11122 	pte = *ptep;
11123 	pte = tte_to_pa(pte) | template;
11124 	write_pte_strong(ptep, pte);
11125 }
11126 
/*
 * PTE template for the commpage data mappings: valid, writeback-cached,
 * inner-shareable, read-only (AP_RORO), and never executable (NX | PNX).
 * Note absence of non-global bit.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping: like the data template but
 * without ARM_PTE_NX, so user mode may execute from it (ARM_PTE_PNX still
 * forbids privileged execution).  Note absence of non-global bit and
 * no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11138 
/**
 * Allocate the physical pages backing the commpage (data, read-only data,
 * and — with CONFIG_ARM_PFZ — text), build the dedicated commpage pmap(s),
 * and enter the user mappings at their fixed virtual addresses.
 *
 * @param kernel_data_addr [out] KVA of the commpage data page.
 * @param kernel_text_addr [out] KVA of the commpage text page (0 if none).
 * @param kernel_ro_data_addr [out] KVA of the kernel RO data page.
 * @param user_text_addr [out] user VA chosen for the commpage text page
 *        (0 if CONFIG_ARM_PFZ is disabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, allocate and zero the backing pages individually. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize ("slide") the leaf index of the text commpage mapping. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11289 
11290 
11291 /*
11292  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11293  * with user controlled TTEs for regions that aren't explicitly reserved by the
11294  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11295  */
11296 #if (ARM_PGSHIFT == 14)
11297 /**
11298  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11299  * commpage completely above the maximum 32-bit userspace VA.
11300  */
11301 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11302 
11303 /**
11304  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11305  * userspace VAs can nest the commpage completely above the maximum 64-bit
11306  * userpace VA, but that technically isn't true on macOS. On those systems, the
11307  * commpage lives within the userspace VA range, but is protected by the VM as
11308  * a reserved region (see vm_reserved_regions[] definition for more info).
11309  */
11310 
11311 #elif (ARM_PGSHIFT == 12)
11312 /**
11313  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11314  * above the maximum userspace VA.
11315  */
11316 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11317 #else
11318 #error Nested shared page mapping is unsupported on this config
11319 #endif
11320 
/**
 * Nest the pre-built commpage translation tables into a user pmap at the
 * fixed commpage virtual address, expanding the pmap's table hierarchy to
 * the commpage nesting level if necessary.
 *
 * @param pmap the user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (and, on PPL systems,
 *         KERN_RESOURCE_SHORTAGE) propagated from pmap_expand() so the
 *         caller may retry.  Panics on any other expansion failure.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* The PPL cannot block on allocation; let the caller replenish and retry. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* The lock must be dropped around pmap_expand(), so loop and re-check. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	/* Share the commpage pmap's table by copying its TTE into this pmap. */
	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11423 
/**
 * Remove the nested commpage translation-table entry from a user pmap and
 * flush the corresponding TLB entries.  Safe to call if the commpage was
 * never nested (no twig entry exists).
 *
 * @param pmap the user pmap whose commpage nesting should be undone.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	/* Clear the nested entry, then flush stale translations for the range. */
	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11480 
11481 void
11482 pmap_insert_commpage(
11483 	pmap_t pmap)
11484 {
11485 	kern_return_t kr = KERN_FAILURE;
11486 #if XNU_MONITOR
11487 	do {
11488 		kr = pmap_insert_commpage_ppl(pmap);
11489 
11490 		if (kr == KERN_RESOURCE_SHORTAGE) {
11491 			pmap_alloc_page_for_ppl(0);
11492 		}
11493 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11494 
11495 	pmap_ledger_check_balance(pmap);
11496 #else
11497 	do {
11498 		kr = pmap_insert_commpage_internal(pmap);
11499 	} while (kr == KERN_ABORTED);
11500 #endif
11501 
11502 	if (kr != KERN_SUCCESS) {
11503 		panic("%s: failed to insert the shared page, kr=%d, "
11504 		    "pmap=%p",
11505 		    __FUNCTION__, kr,
11506 		    pmap);
11507 	}
11508 }
11509 
/* Return whether the pmap uses the 64-bit user address-space layout. */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11516 
/* No pmap is considered "exotic" on this configuration; always false. */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11523 
11524 
/* ARMTODO -- an implementation that accounts for
 * holes in the physical map, if any.
 */
/* Return whether the page number refers to a managed physical page. */
boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	return pa_valid(ptoa(pn));
}
11534 
11535 boolean_t
11536 pmap_bootloader_page(
11537 	ppnum_t pn)
11538 {
11539 	pmap_paddr_t paddr = ptoa(pn);
11540 
11541 	if (pa_valid(paddr)) {
11542 		return FALSE;
11543 	}
11544 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11545 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11546 }
11547 
/**
 * Determine whether [va_start, va_end) in the given pmap contains any valid
 * leaf mappings.
 *
 * @param pmap the pmap to scan; NULL is treated as trivially empty.
 * @param va_start start of the virtual range (inclusive).
 * @param va_end end of the virtual range (exclusive).
 *
 * @return TRUE if no PTE in the range is valid, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot not_in_kdp so the lock and unlock decisions below agree. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig (leaf-table span) at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11608 
11609 boolean_t
11610 pmap_is_empty(
11611 	pmap_t pmap,
11612 	vm_map_offset_t va_start,
11613 	vm_map_offset_t va_end)
11614 {
11615 #if XNU_MONITOR
11616 	return pmap_is_empty_ppl(pmap, va_start, va_end);
11617 #else
11618 	return pmap_is_empty_internal(pmap, va_start, va_end);
11619 #endif
11620 }
11621 
11622 vm_map_offset_t
11623 pmap_max_offset(
11624 	boolean_t               is64,
11625 	unsigned int    option)
11626 {
11627 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11628 }
11629 
11630 vm_map_offset_t
11631 pmap_max_64bit_offset(
11632 	__unused unsigned int option)
11633 {
11634 	vm_map_offset_t max_offset_ret = 0;
11635 
11636 #if defined(__arm64__)
11637 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11638 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11639 		max_offset_ret = arm64_pmap_max_offset_default;
11640 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11641 		max_offset_ret = min_max_offset;
11642 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11643 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11644 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11645 		if (arm64_pmap_max_offset_default) {
11646 			max_offset_ret = arm64_pmap_max_offset_default;
11647 		} else if (max_mem > 0xC0000000) {
11648 			// devices with > 3GB of memory
11649 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11650 		} else if (max_mem > 0x40000000) {
11651 			// devices with > 1GB and <= 3GB of memory
11652 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11653 		} else {
11654 			// devices with <= 1 GB of memory
11655 			max_offset_ret = min_max_offset;
11656 		}
11657 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11658 		if (arm64_pmap_max_offset_default) {
11659 			// Allow the boot-arg to override jumbo size
11660 			max_offset_ret = arm64_pmap_max_offset_default;
11661 		} else {
11662 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11663 		}
11664 	} else {
11665 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11666 	}
11667 
11668 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11669 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11670 		assert(max_offset_ret >= min_max_offset);
11671 	}
11672 #else
11673 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11674 #endif
11675 
11676 	return max_offset_ret;
11677 }
11678 
11679 vm_map_offset_t
11680 pmap_max_32bit_offset(
11681 	unsigned int option)
11682 {
11683 	vm_map_offset_t max_offset_ret = 0;
11684 
11685 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11686 		max_offset_ret = arm_pmap_max_offset_default;
11687 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11688 		max_offset_ret = VM_MAX_ADDRESS;
11689 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11690 		max_offset_ret = VM_MAX_ADDRESS;
11691 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11692 		if (arm_pmap_max_offset_default) {
11693 			max_offset_ret = arm_pmap_max_offset_default;
11694 		} else if (max_mem > 0x20000000) {
11695 			max_offset_ret = VM_MAX_ADDRESS;
11696 		} else {
11697 			max_offset_ret = VM_MAX_ADDRESS;
11698 		}
11699 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11700 		max_offset_ret = VM_MAX_ADDRESS;
11701 	} else {
11702 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11703 	}
11704 
11705 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11706 	return max_offset_ret;
11707 }
11708 
11709 #if CONFIG_DTRACE
11710 /*
11711  * Constrain DTrace copyin/copyout actions
11712  */
11713 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11714 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11715 
11716 kern_return_t
11717 dtrace_copyio_preflight(
11718 	__unused addr64_t va)
11719 {
11720 	if (current_map() == kernel_map) {
11721 		return KERN_FAILURE;
11722 	} else {
11723 		return KERN_SUCCESS;
11724 	}
11725 }
11726 
/* No post-copyio cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11733 #endif /* CONFIG_DTRACE */
11734 
11735 
/* Deferred TLB flush contexts are not used by this pmap implementation. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11740 
11741 
11742 void
11743 pmap_flush(
11744 	__unused pmap_flush_context *cpus_to_flush)
11745 {
11746 	/* not implemented yet */
11747 	return;
11748 }
11749 
11750 #if XNU_MONITOR
11751 
11752 /*
11753  * Enforce that the address range described by kva and nbytes is not currently
11754  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11755  * unintentionally writing to PPL-owned memory.
11756  */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Walk the range one page at a time. */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Refuse to pin through the static physical-aperture mapping. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * Atomically set PP_ATTR_NO_MONITOR so the page cannot become
		 * PPL-owned while pinned; the CAS loop retries if the attribute
		 * word changes underneath us, and panics if the page is already
		 * PPL-owned (PP_ATTR_MONITOR set).
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11779 
/*
 * Release the pin taken by pmap_pin_kernel_pages() on the given range,
 * clearing PP_ATTR_NO_MONITOR on each underlying physical page.
 * Panics if any page in the range was not actually pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page must never have become PPL-owned in the meantime. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11797 
11798 /**
11799  * Lock down a page, making all mappings read-only, and preventing further
11800  * mappings or removal of this particular kva's mapping. Effectively, it makes
11801  * the physical page at kva immutable (see the ppl_writable parameter for an
11802  * exception to this).
11803  *
11804  * @param kva Valid address to any mapping of the physical page to lockdown.
11805  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11806  * @param ppl_writable True if the PPL should still be able to write to the page
11807  *                     using the physical aperture mapping. False will make the
11808  *                     page read-only for both the kernel and PPL in the
11809  *                     physical aperture.
11810  */
11811 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Lock the page down with read-only maximum permissions for alias mappings. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11817 
11818 /**
11819  * Lock down a page, giving all mappings the specified maximum permissions, and
11820  * preventing further mappings or removal of this particular kva's mapping.
11821  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11822  * parameter for an exception to this).
11823  *
11824  * @param kva Valid address to any mapping of the physical page to lockdown.
11825  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11826  * @param ppl_writable True if the PPL should still be able to write to the page
11827  *                     using the physical aperture mapping. False will make the
11828  *                     page read-only for both the kernel and PPL in the
11829  *                     physical aperture.
11830  * @param prot Maximum permissions to allow in existing alias mappings
11831  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot additionally be locked down. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* A page may carry at most one lockdown reason, and never be executable. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings to at most 'prot'. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11879 
11880 /**
11881  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11882  * kernel once again.
11883  *
11884  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11885  *       to unlockdown a page that was never locked down, will panic.
11886  *
11887  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11888  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11889  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11890  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11891  *                     deviation will result in a panic.
11892  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlocking a page that was never locked down (for this reason) is fatal. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11913 
11914 /**
11915  * Release a page from being locked down to the PPL, making it writable to the
11916  * kernel once again.
11917  *
11918  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11919  *       to unlockdown a page that was never locked down, will panic.
11920  *
11921  * @param kva Valid address to any mapping of the physical page to unlockdown.
11922  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11923  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11924  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11925  *                     deviation will result in a panic.
11926  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PVH lock and delegate to the locked variant. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11938 
11939 #else /* XNU_MONITOR */
11940 
/* Without the PPL, there is no monitor ownership to guard against; no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11945 
/* Counterpart of the non-PPL pmap_pin_kernel_pages() stub; no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11950 
11951 #endif /* !XNU_MONITOR */
11952 
11953 
/*
 * Lock down a range of pages on behalf of code-signing, tagging them with the
 * CS lockdown reason on PPL systems (no reason bit exists otherwise).
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11963 
/* Release a code-signing lockdown taken via pmap_cs_lockdown_pages(). */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11973 
11974 /**
11975  * Perform basic validation checks on the destination only and
11976  * corresponding offset/sizes prior to writing to a read only allocation.
11977  *
11978  * @note Should be called before writing to an allocation from the read
11979  * only allocator.
11980  *
11981  * @param zid The ID of the zone the allocation belongs to.
11982  * @param va VA of element being modified (destination).
11983  * @param offset Offset being written to, in the element.
11984  * @param new_data_size Size of modification.
11985  *
11986  */
11987 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	/* The zone must be one of the dedicated read-only zones. */
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/*
	 * Note on ordering: if offset > elem_size, (elem_size - offset) wraps
	 * (vm_size_t is unsigned) to a huge value, so this first check may pass
	 * for an out-of-range offset — but the offset check below then panics,
	 * so every invalid combination is still rejected.
	 */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
12014 
12015 
12016 /**
12017  * Perform basic validation checks on the source, destination and
12018  * corresponding offset/sizes prior to writing to a read only allocation.
12019  *
12020  * @note Should be called before writing to an allocation from the read
12021  * only allocator.
12022  *
12023  * @param zid The ID of the zone the allocation belongs to.
12024  * @param va VA of element being modified (destination).
12025  * @param offset Offset being written to, in the element.
12026  * @param new_data Pointer to new data (source).
12027  * @param new_data_size Size of modification.
12028  *
12029  */
12030 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	vm_offset_t sum = 0;

	/* Reject a source range that wraps the address space. */
	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
		panic("%s: Integer addition overflow %p + %lu = %lu",
		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
	}

	/* Destination-side checks (zone membership, alignment, bounds). */
	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
}
12048 
12049 /**
12050  * Ensure that physical page is locked down and pinned, before writing to it.
12051  *
12052  * @note Should be called before writing to an allocation from the read
12053  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12054  * ensure that it is called after the modification.
12055  *
12056  *
12057  * @param pa Physical address of the element being modified.
12058  * @param va Virtual address of element being modified.
12059  * @param size Size of the modification.
12060  *
12061  */
12062 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Held until the matching pmap_ro_zone_unlock_phy_page() call. */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
12083 
12084 /**
12085  * Unlock and unpin physical page after writing to it.
12086  *
12087  * @note Should be called after writing to an allocation from the read
12088  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12089  * ensure that it has been called prior to the modification.
12090  *
12091  * @param pa Physical address of the element that was modified.
12092  * @param va Virtual address of element that was modified.
12093  * @param size Size of the modification.
12094  *
12095  */
12096 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_unlock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Reverse order of pmap_ro_zone_lock_phy_page(): unpin, then drop the PVH lock. */
	pmap_unpin_kernel_pages(va, size);
	pvh_unlock(pai);
}
12107 
12108 /**
12109  * Function to copy kauth_cred from new_data to kv.
12110  * Function defined in "kern_prot.c"
12111  *
12112  * @note Will be removed upon completion of
12113  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12114  *
12115  * @param kv Address to copy new data to.
12116  * @param new_data Pointer to new data.
12117  *
12118  */
12119 
12120 extern void
12121 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12122 
12123 /**
12124  * Zalloc-specific memcpy that writes through the physical aperture
12125  * and ensures the element being modified is from a read-only zone.
12126  *
12127  * @note Designed to work only with the zone allocator's read-only submap.
12128  *
12129  * @param zid The ID of the zone to allocate from.
12130  * @param va VA of element to be modified.
12131  * @param offset Offset from element.
12132  * @param new_data Pointer to new data.
12133  * @param new_data_size	Size of modification.
12134  *
12135  */
12136 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* On PPL systems the copy must be performed inside the PPL. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12151 
12152 MARK_AS_PMAP_TEXT void
12153 pmap_ro_zone_memcpy_internal(
12154 	zone_id_t             zid,
12155 	vm_offset_t           va,
12156 	vm_offset_t           offset,
12157 	const vm_offset_t     new_data,
12158 	vm_size_t             new_data_size)
12159 {
12160 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12161 
12162 	if (!new_data || new_data_size == 0) {
12163 		return;
12164 	}
12165 
12166 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12167 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12168 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12169 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12170 }
12171 
12172 /**
12173  * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
12175  *
12176  * @note Designed to work only with the zone allocator's read-only submap.
12177  *
12178  * @param zid The ID of the zone the element belongs to.
12179  * @param va VA of element to be modified.
12180  * @param offset Offset in element.
12181  * @param op Atomic operation to perform.
12182  * @param value	Mutation value.
12183  *
12184  */
12185 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* On PPL systems the mutation must be performed inside the PPL. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12200 
/*
 * Atomically mutate a field of a read-only zone element through the physical
 * aperture, with the page pinned and its PVH lock held.  Returns the result
 * produced by __zalloc_ro_mut_atomic().
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of the op encodes the operand size in bytes. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12219 
12220 /**
12221  * bzero for allocations from read only zones, that writes through the
12222  * physical aperture.
12223  *
12224  * @note This is called by the zfree path of all allocations from read
12225  * only zones.
12226  *
12227  * @param zid The ID of the zone the allocation belongs to.
12228  * @param va VA of element to be zeroed.
12229  * @param offset Offset in the element.
12230  * @param size	Size of allocation.
12231  *
12232  */
12233 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* On PPL systems the zeroing must be performed inside the PPL. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12247 
/*
 * Zero `size` bytes of a read-only zone element through the physical
 * aperture, with the page pinned and its PVH lock held.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Source of 0 is fine here: only the destination-side checks matter. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12261 
12262 /**
12263  * Removes write access from the Physical Aperture.
12264  *
12265  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12266  * @note Designed to work only with the zone allocator's read-only submap.
12267  *
 * @param va VA of the page to remove write access from.
12269  *
12270  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down read-only for the kernel; the PPL retains write access. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* Without a PPL, simply demote all virtual mappings of the page to RO. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12280 
12281 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12282 
/*
 * Count resident and compressed bytes for a VA range that lies within a
 * single twig (TTE) of the pmap.  Returns PMAP_RESIDENT_INVALID if the pmap
 * is NULL or the twig has no page table; otherwise returns the resident byte
 * count and, if compressed_bytes_p is non-NULL, accumulates compressed bytes
 * into it.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Walk the leaf-level PTEs covering [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the kernel-side output buffer while writing through it. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12344 
/*
 * Count resident bytes in [start, end) of a pmap, iterating twig by twig via
 * pmap_query_resident_internal() (or its PPL counterpart).  If
 * compressed_bytes_p is non-NULL it receives the compressed byte count.
 * Stops early if a twig reports PMAP_RESIDENT_INVALID.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t          total_resident_bytes;
	mach_vm_size_t          compressed_bytes;
	vm_map_address_t        va;


	if (pmap == PMAP_NULL) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	total_resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	va = start;
	while (va < end) {
		vm_map_address_t l;
		mach_vm_size_t resident_bytes;

		/* Advance to the next twig boundary (or the end of the range). */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > end) {
			l = end;
		}
#if XNU_MONITOR
		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
#else
		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
#endif
		if (resident_bytes == PMAP_RESIDENT_INVALID) {
			break;
		}

		total_resident_bytes += resident_bytes;

		va = l;
	}

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    total_resident_bytes);

	return total_resident_bytes;
}
12406 
12407 #if MACH_ASSERT
12408 static void
12409 pmap_check_ledgers(
12410 	pmap_t pmap)
12411 {
12412 	int     pid;
12413 	char    *procname;
12414 
12415 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12416 		/*
12417 		 * This pmap was not or is no longer fully associated
12418 		 * with a task (e.g. the old pmap after a fork()/exec() or
12419 		 * spawn()).  Its "ledger" still points at a task that is
12420 		 * now using a different (and active) address space, so
12421 		 * we can't check that all the pmap ledgers are balanced here.
12422 		 *
12423 		 * If the "pid" is set, that means that we went through
12424 		 * pmap_set_process() in task_terminate_internal(), so
12425 		 * this task's ledger should not have been re-used and
12426 		 * all the pmap ledgers should be back to 0.
12427 		 */
12428 		return;
12429 	}
12430 
12431 	pid = pmap->pmap_pid;
12432 	procname = pmap->pmap_procname;
12433 
12434 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12435 }
12436 #endif /* MACH_ASSERT */
12437 
/* Page-zero range advice is not used by this pmap implementation; no-op. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12442 
12443 /**
12444  * The minimum shared region nesting size is used by the VM to determine when to
12445  * break up large mappings to nested regions. The smallest size that these
12446  * mappings can be broken into is determined by what page table level those
12447  * regions are being nested in at and the size of the page tables.
12448  *
12449  * For instance, if a nested region is nesting at L2 for a process utilizing
12450  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12451  * block entry).
12452  *
12453  * @param pmap The target pmap to determine the block size based on whether it's
12454  *             using 16KB or 4KB page tables.
12455  */
12456 uint64_t
12457 pmap_shared_region_size_min(__unused pmap_t pmap)
12458 {
12459 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12460 
12461 	/**
12462 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12463 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12464 	 * point to shared L3 page tables in the shared region pmap.
12465 	 */
12466 	return pt_attr_twig_size(pt_attr);
12467 }
12468 
/* Execute-only protection is enforced for all pmaps except the kernel's. */
boolean_t
pmap_enforces_execute_only(
	pmap_t pmap)
{
	return pmap != kernel_pmap;
}
12475 
/* Record the VM map's code-signing enforcement state on the pmap. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12484 
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	/* On PPL systems the pmap field must be written inside the PPL. */
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12496 
12497 extern int cs_process_enforcement_enable;
12498 bool
12499 pmap_get_vm_map_cs_enforced(
12500 	pmap_t pmap)
12501 {
12502 	if (cs_process_enforcement_enable) {
12503 		return true;
12504 	}
12505 	return pmap->pmap_vm_map_cs_enforced;
12506 }
12507 
/* JIT entitlement tracking is not implemented in this pmap; no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12514 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
	/* Dispatch into the PPL when present; the internal path is a no-op. */
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12525 
/* JIT entitlement state is not tracked by this pmap; always reports false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12532 
/* TPRO support is not implemented in this pmap; no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12539 
void
pmap_set_tpro(
	pmap_t pmap)
{
	/* Dispatch into the PPL when present; the internal path is a no-op. */
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12550 
/* TPRO state is not tracked by this pmap; always reports false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12557 
12558 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12559 
/*
 * Report the disposition of the page mapped at `va` in `pmap` through
 * *disp_p as a bitmask of PMAP_QUERY_PAGE_* flags (present, compressed,
 * alt-accounted, reusable, internal).  Returns KERN_INVALID_ARGUMENT for a
 * NULL or kernel pmap.  Retries if the PTE changes between the unlocked read
 * and taking the PVH lock.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be concurrently modified. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		/* Pages outside managed memory have no PV head to inspect. */
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry corresponding to this PTE, if the PVH holds a list. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the kernel-side output buffer while writing through it. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12638 
/*
 * Query the disposition of the page mapped at 'va' within 'pmap'. On
 * XNU_MONITOR builds this traps into the PPL; otherwise it calls the
 * internal implementation directly.
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12651 
12652 
12653 
/*
 * Number of valid user VA bits for this pmap: 64 minus T0SZ. On
 * mixed-page-size systems T0SZ is extracted from the pmap's own TCR
 * value; otherwise the boot-time constant applies to every pmap.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12664 
/*
 * Number of valid kernel VA bits: 64 minus the boot-time T1SZ setting.
 */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12670 
12671 static vm_map_size_t
12672 pmap_user_va_size(pmap_t pmap)
12673 {
12674 	return 1ULL << pmap_user_va_bits(pmap);
12675 }
12676 
12677 
12678 
12679 bool
12680 pmap_in_ppl(void)
12681 {
12682 	// Unsupported
12683 	return false;
12684 }
12685 
/*
 * I/O-filtered protected writes are not available on this platform;
 * any call is a fatal error.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12692 
12693 void *
12694 pmap_claim_reserved_ppl_page(void)
12695 {
12696 	// Unsupported
12697 	return NULL;
12698 }
12699 
12700 void
12701 pmap_free_reserved_ppl_page(void __unused *kva)
12702 {
12703 	// Unsupported
12704 }
12705 
12706 
#if PMAP_CS_PPL_MONITOR

/* Immutable part of the trust cache runtime (SECURITY_READ_ONLY_LATE) */
SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;

/* Mutable part of the trust cache runtime (PPL-protected data) */
MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;

/* Reader-writer lock guarding queries and loads on the runtime */
MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12717 
12718 MARK_AS_PMAP_TEXT kern_return_t
12719 pmap_check_trust_cache_runtime_for_uuid_internal(
12720 	const uint8_t check_uuid[kUUIDSize])
12721 {
12722 	kern_return_t ret = KERN_DENIED;
12723 
12724 	if (amfi->TrustCache.version < 3) {
12725 		/* AMFI change hasn't landed in the build */
12726 		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12727 		return KERN_NOT_SUPPORTED;
12728 	}
12729 
12730 	/* Lock the runtime as shared */
12731 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12732 
12733 	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12734 		&ppl_trust_cache_rt,
12735 		check_uuid,
12736 		NULL);
12737 
12738 	/* Unlock the runtime */
12739 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12740 
12741 	if (tc_ret.error == kTCReturnSuccess) {
12742 		ret = KERN_SUCCESS;
12743 	} else if (tc_ret.error == kTCReturnNotFound) {
12744 		ret = KERN_NOT_FOUND;
12745 	} else {
12746 		ret = KERN_FAILURE;
12747 		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12748 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12749 	}
12750 
12751 	return ret;
12752 }
12753 
/*
 * Kernel-facing wrapper: check for a loaded trust cache with the given
 * UUID by trapping into the PPL.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12760 
/*
 * PPL-side implementation for loading a trust cache of a given type.
 *
 * The payload wraps both the raw img4 trust cache data and the
 * TrustCache_t bookkeeping structure; the payload is locked down
 * PPL-writable so libTrustCache can manage that structure in place,
 * while the manifest is locked down read-only for the validation.
 *
 * Returns KERN_RESOURCE_SHORTAGE when the caller must donate a page to
 * the PPL and retry, KERN_ALREADY_IN_SET for a duplicate load, and
 * KERN_SUCCESS/KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* These types may only be loaded through other, earlier paths */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data -- on success it stays locked down */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12880 
12881 kern_return_t
12882 pmap_load_trust_cache_with_type(
12883 	TCType_t type,
12884 	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12885 	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12886 	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12887 {
12888 	kern_return_t ret = KERN_DENIED;
12889 
12890 	ret = pmap_load_trust_cache_with_type_ppl(
12891 		type,
12892 		pmap_img4_payload, pmap_img4_payload_len,
12893 		img4_manifest, img4_manifest_len,
12894 		img4_aux_manifest, img4_aux_manifest_len);
12895 
12896 	while (ret == KERN_RESOURCE_SHORTAGE) {
12897 		/* Allocate a page from the free list */
12898 		pmap_alloc_page_for_ppl(0);
12899 
12900 		/* Attempt the call again */
12901 		ret = pmap_load_trust_cache_with_type_ppl(
12902 			type,
12903 			pmap_img4_payload, pmap_img4_payload_len,
12904 			img4_manifest, img4_manifest_len,
12905 			img4_aux_manifest, img4_aux_manifest_len);
12906 	}
12907 
12908 	return ret;
12909 }
12910 
12911 MARK_AS_PMAP_TEXT kern_return_t
12912 pmap_query_trust_cache_safe(
12913 	TCQueryType_t query_type,
12914 	const uint8_t cdhash[kTCEntryHashSize],
12915 	TrustCacheQueryToken_t *query_token)
12916 {
12917 	kern_return_t ret = KERN_NOT_FOUND;
12918 
12919 	/* Validate the query type preemptively */
12920 	if (query_type >= kTCQueryTypeTotal) {
12921 		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12922 		return KERN_INVALID_ARGUMENT;
12923 	}
12924 
12925 	/* Lock the runtime as shared */
12926 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12927 
12928 	TCReturn_t tc_ret = amfi->TrustCache.query(
12929 		&ppl_trust_cache_rt,
12930 		query_type,
12931 		cdhash,
12932 		query_token);
12933 
12934 	/* Unlock the runtime */
12935 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12936 
12937 	if (tc_ret.error == kTCReturnSuccess) {
12938 		ret = KERN_SUCCESS;
12939 	} else if (tc_ret.error == kTCReturnNotFound) {
12940 		ret = KERN_NOT_FOUND;
12941 	} else {
12942 		ret = KERN_FAILURE;
12943 		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12944 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12945 	}
12946 
12947 	return ret;
12948 }
12949 
/*
 * PPL entry point for a trust cache query. The cdhash is copied into
 * PPL-local storage before use so the kernel cannot mutate it
 * mid-query; the result token is written back through pinned kernel
 * pages.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	/* Copy out the query token, if the caller asked for one */
	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
12974 
12975 kern_return_t
12976 pmap_query_trust_cache(
12977 	TCQueryType_t query_type,
12978 	const uint8_t cdhash[kTCEntryHashSize],
12979 	TrustCacheQueryToken_t *query_token)
12980 {
12981 	kern_return_t ret = KERN_NOT_FOUND;
12982 
12983 	ret = pmap_query_trust_cache_ppl(
12984 		query_type,
12985 		cdhash,
12986 		query_token);
12987 
12988 	return ret;
12989 }
12990 
/* Whether the developer mode state has been explicitly set at least once */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
/* Current developer mode state of the system */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12993 
/*
 * Update the system developer mode state inside the PPL, enforcing the
 * transition rules documented below: once developer mode has been set
 * to false, an attempt to turn it on panics.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13022 
/*
 * Kernel-facing wrapper: toggle developer mode by trapping into the PPL.
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13029 
13030 #endif /* PMAP_CS_PPL_MONITOR */
13031 
13032 #if PMAP_CS_INCLUDE_CODE_SIGNING
13033 
/*
 * Comparator for the provisioning-profile red-black tree: orders nodes
 * by their address. The addresses are compared as uintptr_t values to
 * avoid the formally-undefined relational comparison of pointers into
 * distinct objects (C11 6.5.8p5); the resulting order is identical on
 * all supported platforms.
 *
 * Returns -1, 0, or 1 for less-than, equal, and greater-than.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
13046 
/* Red-black tree of all registered provisioning profiles, keyed by the
 * address of the profile object (see pmap_cs_profiles_rbtree_compare) */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the red-black tree implementation for the profile nodes */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13056 
/*
 * One-time initialization of the provisioning-profile bookkeeping: the
 * red-black tree and the (non-sleepable) lock that protects it.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* The PPL cannot sleep while holding this lock */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13069 
13070 static bool
13071 pmap_is_testflight_profile(
13072 	pmap_cs_profile_t *profile_obj)
13073 {
13074 	const char *entitlement_name = "beta-reports-active";
13075 	const size_t entitlement_length = strlen(entitlement_name);
13076 	CEQueryOperation_t query[2] = {0};
13077 
13078 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13079 	if (profile_obj->entitlements_ctx == NULL) {
13080 		return false;
13081 	}
13082 
13083 	/* Build our CoreEntitlements query */
13084 	query[0].opcode = kCEOpSelectKey;
13085 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13086 	query[0].parameters.stringParameter.length = entitlement_length;
13087 	query[1] = CEMatchBool(true);
13088 
13089 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13090 		profile_obj->entitlements_ctx,
13091 		query, 2);
13092 
13093 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13094 		return true;
13095 	}
13096 
13097 	return false;
13098 }
13099 
/*
 * Classify a registered profile as development or not. A profile is
 * NOT a development profile when it is a universal provisioning
 * profile (UPP, "ProvisionsAllDevices") or a TestFlight profile;
 * everything else is treated as development.
 */
static bool
pmap_is_development_profile(
	pmap_cs_profile_t *profile_obj)
{
	/* Check for UPP */
	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("ProvisionsAllDevices"));
	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
			return false;
		}
	}

	/* Check for TestFlight profile */
	if (pmap_is_testflight_profile(profile_obj) == true) {
		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
		return false;
	}

	pmap_cs_log_info("%p: development profile", profile_obj);
	return true;
}
13124 
/*
 * Extract, validate, and set up the entitlements dictionary embedded in
 * a provisioning profile.
 *
 * Returns KERN_NOT_FOUND when the profile provisions no entitlements
 * (a valid state -- the entitlements context is cleared), KERN_ABORTED
 * on a CoreEntitlements validation/acquisition failure, and
 * KERN_SUCCESS otherwise.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements in the profile: leave the context cleared */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER-encoded entitlements before querying them */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13175 
/*
 * PPL-side registration of a provisioning profile.
 *
 * The payload is locked down (kept PPL-writable, since it embeds the
 * profile object the PPL manages in place), validated through
 * CoreTrust, parsed with CoreEntitlements, and finally inserted into
 * the registered-profiles red-black tree.
 *
 * Returns KERN_RESOURCE_SHORTAGE when the caller must donate a page to
 * the PPL and retry, KERN_SUCCESS on success; validation failures
 * panic.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Wrap the validated profile content in a CoreEntitlements context */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13279 
13280 kern_return_t
13281 pmap_register_provisioning_profile(
13282 	const vm_address_t payload_addr,
13283 	const vm_size_t payload_size)
13284 {
13285 	kern_return_t ret = KERN_DENIED;
13286 
13287 	ret = pmap_register_provisioning_profile_ppl(
13288 		payload_addr,
13289 		payload_size);
13290 
13291 	while (ret == KERN_RESOURCE_SHORTAGE) {
13292 		/* Allocate a page from the free list */
13293 		pmap_alloc_page_for_ppl(0);
13294 
13295 		/* Attempt the call again */
13296 		ret = pmap_register_provisioning_profile_ppl(
13297 			payload_addr,
13298 			payload_size);
13299 	}
13300 
13301 	return ret;
13302 }
13303 
/*
 * PPL-side unregistration of a provisioning profile.
 *
 * Fails with KERN_FAILURE if the profile is still referenced by any
 * code signature; panics if the profile was never registered. On
 * success, the original payload pages are unlocked and returned to the
 * kernel.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A profile can only be removed once nothing references it */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The payload (and the profile within it) is no longer PPL-owned */
		profile_obj = NULL;
	}
	return ret;
}
13351 
/*
 * Kernel-facing wrapper: unregister a provisioning profile by trapping
 * into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13358 
/*
 * Associate a registered provisioning profile with a code signature.
 *
 * Only allowed while the signature is still untrusted and has no prior
 * profile association. Takes a reference on the profile object so it
 * cannot be unregistered while associated.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13407 
/*
 * Kernel-facing wrapper: associate a profile with a code signature by
 * trapping into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13415 
/*
 * Break the association between a code signature and its provisioning
 * profile, dropping the reference taken at association time.
 *
 * Returns KERN_NOT_FOUND if no profile was associated, KERN_SUCCESS
 * otherwise.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13451 
/*
 * Kernel-facing wrapper: disassociate a profile from a code signature
 * by trapping into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13458 
13459 kern_return_t
13460 pmap_associate_kernel_entitlements_internal(
13461 	pmap_cs_code_directory_t *cd_entry,
13462 	const void *kernel_entitlements)
13463 {
13464 	kern_return_t ret = KERN_DENIED;
13465 
13466 	if (kernel_entitlements == NULL) {
13467 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13468 	}
13469 
13470 	/* Acquire the lock on the code directory */
13471 	pmap_cs_lock_code_directory(cd_entry);
13472 
13473 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13474 		ret = KERN_DENIED;
13475 		goto out;
13476 	} else if (cd_entry->kernel_entitlements != NULL) {
13477 		ret = KERN_DENIED;
13478 		goto out;
13479 	}
13480 	cd_entry->kernel_entitlements = kernel_entitlements;
13481 
13482 	/* Association was a success */
13483 	ret = KERN_SUCCESS;
13484 
13485 out:
13486 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13487 	return ret;
13488 }
13489 
/*
 * Kernel-facing wrapper: associate kernel entitlements with a code
 * signature by trapping into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13497 
/*
 * Resolve the kernel entitlements object associated with the main code
 * signature of a pmap, writing the pointer through
 * 'kernel_entitlements' (if non-NULL) via pinned kernel pages.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap or when no signature /
 * entitlements exist, KERN_ABORTED when the pmap lock could not be
 * taken without waiting, and KERN_SUCCESS otherwise.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13560 
13561 kern_return_t
13562 pmap_resolve_kernel_entitlements(
13563 	pmap_t pmap,
13564 	const void **kernel_entitlements)
13565 {
13566 	kern_return_t ret = KERN_DENIED;
13567 
13568 	do {
13569 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13570 	} while (ret == KERN_ABORTED);
13571 
13572 	return ret;
13573 }
13574 
/*
 * PPL-side routine which builds the CoreEntitlements acceleration index for a
 * code signature's entitlements context, speeding up later entitlement queries.
 *
 * Returns:
 *   KERN_SUCCESS - context is (or became) accelerated, or isn't eligible
 *   KERN_DENIED  - signature isn't reconstituted (which also implies untrusted)
 *   KERN_ABORTED - required index would exceed a page; acceleration abandoned
 *   other        - propagated from the buffer allocators (the kernel-side
 *                  wrapper retries on KERN_RESOURCE_SHORTAGE after donating
 *                  a page to the PPL)
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory (exclusive; we mutate state) */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements context, or already accelerated: nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Place the buffer in the slack space after the superblob, aligned up */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* allocated == false: buffer lives inside the signature, never freed separately */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			/* Too big for a bucket -- take a whole page from the free list */
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13721 
13722 kern_return_t
13723 pmap_accelerate_entitlements(
13724 	pmap_cs_code_directory_t *cd_entry)
13725 {
13726 	kern_return_t ret = KERN_DENIED;
13727 
13728 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
13729 	while (ret == KERN_RESOURCE_SHORTAGE) {
13730 		/* Allocate a page for the PPL */
13731 		pmap_alloc_page_for_ppl(0);
13732 
13733 		/* Try again */
13734 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
13735 	}
13736 
13737 	return ret;
13738 }
13739 
13740 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13741 
13742 MARK_AS_PMAP_TEXT bool
13743 pmap_lookup_in_loaded_trust_caches_internal(
13744 	const uint8_t cdhash[CS_CDHASH_LEN])
13745 {
13746 	kern_return_t kr = KERN_NOT_FOUND;
13747 
13748 #if PMAP_CS_PPL_MONITOR
13749 	/*
13750 	 * If we have the PPL monitor, then this function can only be called from
13751 	 * within the PPL. Calling it directly would've caused a panic, so we can
13752 	 * assume that we're in the PPL here.
13753 	 */
13754 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13755 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13756 
13757 	kr = pmap_query_trust_cache_safe(
13758 		kTCQueryTypeLoadable,
13759 		cdhash_safe,
13760 		NULL);
13761 #else
13762 	kr = query_trust_cache(
13763 		kTCQueryTypeLoadable,
13764 		cdhash,
13765 		NULL);
13766 #endif
13767 
13768 	if (kr == KERN_SUCCESS) {
13769 		return true;
13770 	}
13771 	return false;
13772 }
13773 
/*
 * Kernel-side entry point: look up a CDHash in the loadable trust caches,
 * routing through the PPL trampoline when the monitor is built in.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13784 
/*
 * Look up a CDHash in the static (built-in) trust cache.
 *
 * Returns 0 when not found. On a hit, returns a packed word:
 * TC_LOOKUP_FOUND in the result field, the entry's hash type, and the low
 * 8 bits of the entry's flags (note the (uint8_t) truncation below), each
 * shifted into its TC_LOOKUP_*_SHIFT position.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here. Copy the CDHash into PPL-local
	 * storage before querying.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Pull the flags and hash type for the matched entry out of the token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
13825 
/*
 * Kernel-side entry point: look up a CDHash in the static trust cache,
 * routing through the PPL trampoline when the monitor is built in.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13835 
13836 #if PMAP_CS_INCLUDE_CODE_SIGNING
13837 
13838 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
13839 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13840 
13841 MARK_AS_PMAP_TEXT void
13842 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13843 {
13844 
13845 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13846 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13847 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13848 
13849 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13850 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13851 }
13852 
13853 MARK_AS_PMAP_TEXT bool
13854 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13855 {
13856 	bool match = false;
13857 
13858 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13859 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13860 		match = true;
13861 	}
13862 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13863 
13864 	if (match) {
13865 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13866 	}
13867 
13868 	return match;
13869 }
13870 
/*
 * Kernel-side entry point for publishing the compilation service CDHash,
 * routing through the PPL trampoline when the monitor is built in.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13880 
/*
 * Kernel-side entry point for matching a CDHash against the compilation
 * service CDHash, routing through the PPL trampoline when the monitor is
 * built in.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13890 
13891 /*
13892  * As part of supporting local signing on the device, we need the PMAP layer
13893  * to store the local signing key so that PMAP_CS can validate with it. We
13894  * store it at the PMAP layer such that it is accessible to both AMFI and
13895  * PMAP_CS should they need it.
13896  */
13897 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13898 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13899 
13900 MARK_AS_PMAP_TEXT void
13901 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13902 {
13903 	bool key_set = false;
13904 
13905 	/*
13906 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13907 	 * a successful exchange means that the local signing public key has _not_ been
13908 	 * set. In case the key has been set, we panic as we would never expect the
13909 	 * kernel to attempt to set the key more than once.
13910 	 */
13911 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13912 
13913 	if (key_set) {
13914 		panic("attempted to set the local signing public key multiple times");
13915 	}
13916 
13917 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13918 	pmap_cs_log_info("set local signing public key");
13919 }
13920 
/*
 * Kernel-side entry point for storing the local signing public key, routing
 * through the PPL trampoline when the monitor is built in.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13930 
13931 uint8_t*
13932 pmap_get_local_signing_public_key(void)
13933 {
13934 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13935 
13936 	if (key_set) {
13937 		return pmap_local_signing_public_key;
13938 	}
13939 
13940 	return NULL;
13941 }
13942 
13943 /*
13944  * Locally signed applications need to be explicitly authorized by an entitled application
13945  * before we allow them to run.
13946  */
13947 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
13948 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13949 
/*
 * PPL-side routine which records the CDHash of the locally-signed application
 * being authorized to run. Only one CDHash is recorded at a time (a later
 * call overwrites the previous one).
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	/* Copy under the lock so concurrent readers never observe a torn hash */
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13962 
/*
 * Kernel-side entry point for authorizing a locally-signed application's
 * CDHash, routing through the PPL trampoline when the monitor is built in.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
13973 
13974 #if PMAP_CS
/*
 * Clear the recorded local-signing CDHash, restoring the default state in
 * which no locally-signed application is authorized.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13982 
13983 MARK_AS_PMAP_TEXT static bool
13984 pmap_local_signing_restricted(
13985 	const uint8_t cdhash[CS_CDHASH_LEN])
13986 {
13987 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13988 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13989 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13990 
13991 	return ret != 0;
13992 }
13993 
13994 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13995 #endif
13996 
/*
 * Toggle the current thread's "footprint suspended" flag. Only has an effect
 * on DEVELOPMENT/DEBUG kernels; a release kernel compiles this to a no-op.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		/* Sticky record that this pmap had footprint accounting suspended */
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14014 
/*
 * Kernel-side entry point for suspending/resuming footprint accounting on a
 * map, routing through the PPL trampoline when the monitor is built in.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14026 
/*
 * PPL-side no-op: does nothing beyond validating that the argument is a
 * mutable pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14032 
/*
 * No-op entry point; trampolines into the PPL when the monitor is built in.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14042 
14043 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14044 
/*
 * Header written in front of each dumped translation table by
 * pmap_dump_page_tables_recurse(); the table's entries follow it directly.
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the dumped table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by this table */
	uint64_t end_va;      /* VA just past the range covered by this table */
};
14051 
/*
 * Recursively copy a pmap's translation tables into [buf_start, buf_end).
 * For each table at a level selected by level_mask, a page_table_dump_header
 * is written followed by the raw table contents; *bytes_copied tracks the
 * running output size. Returns KERN_INSUFFICIENT_BUFFER_SIZE when the buffer
 * cannot hold the next table, KERN_SUCCESS otherwise.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and entry-classification masks */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table may be a different (sub-page) size */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Make sure header + table contents fit before writing anything */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* This level was requested: emit header followed by the raw table */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table to recurse into */
			continue;
		} else {
			/* A table-type entry at the leaf level is impossible */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14126 
/*
 * Dump a pmap's translation tables into the provided buffer. May only be
 * called from kernel-debugger context; see pmap_dump_page_tables_recurse()
 * for the output layout.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14136 
14137 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14138 
/* Stub: page-table dumping is only built on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14145 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14146 
14147 
14148 #ifdef CONFIG_XNUPOST
14149 #ifdef __arm64__
14150 static volatile bool pmap_test_took_fault = false;
14151 
14152 static bool
14153 pmap_test_fault_handler(arm_saved_state_t * state)
14154 {
14155 	bool retval                 = false;
14156 	uint32_t esr                = get_saved_state_esr(state);
14157 	esr_exception_class_t class = ESR_EC(esr);
14158 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14159 
14160 	if ((class == ESR_EC_DABORT_EL1) &&
14161 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14162 		pmap_test_took_fault = true;
14163 		/* return to the instruction immediately after the call to NX page */
14164 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14165 		retval = true;
14166 	}
14167 
14168 	return retval;
14169 }
14170 
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write at va (optionally after switching to the
 * given pmap) and report whether the fault outcome matched expectations.
 * Returns true when (fault occurred) == should_fault.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler before touching the address */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14223 
14224 static bool
14225 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14226 {
14227 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14228 
14229 	if (!retval) {
14230 		T_FAIL("%s: %s, "
14231 		    "pmap=%p, va=%p, should_fault=%u",
14232 		    __func__, should_fault ? "did not fault" : "faulted",
14233 		    pmap, (void*)va, (unsigned)should_fault);
14234 	}
14235 
14236 	return retval;
14237 }
14238 
14239 static bool
14240 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14241 {
14242 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14243 
14244 	if (!retval) {
14245 		T_FAIL("%s: %s, "
14246 		    "pmap=%p, va=%p, should_fault=%u",
14247 		    __func__, should_fault ? "did not fault" : "faulted",
14248 		    pmap, (void*)va, (unsigned)should_fault);
14249 	}
14250 
14251 	return retval;
14252 }
14253 
14254 static bool
14255 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14256 {
14257 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14258 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14259 
14260 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14261 
14262 	if (!retval) {
14263 		T_FAIL("%s: bits=%u, "
14264 		    "pa=%p, should_be_set=%u",
14265 		    __func__, bits,
14266 		    (void*)pa, should_be_set);
14267 	}
14268 
14269 	return retval;
14270 }
14271 
14272 static __attribute__((noinline)) bool
14273 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14274 {
14275 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14276 	return retval;
14277 }
14278 
14279 static int
14280 pmap_test_test_config(unsigned int flags)
14281 {
14282 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
14283 	unsigned int map_count = 0;
14284 	unsigned long page_ratio = 0;
14285 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
14286 
14287 	if (!pmap) {
14288 		panic("Failed to allocate pmap");
14289 	}
14290 
14291 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14292 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14293 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14294 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14295 
14296 	if (pmap_page_size <= native_page_size) {
14297 		page_ratio = native_page_size / pmap_page_size;
14298 	} else {
14299 		/*
14300 		 * We claim to support a page_ratio of less than 1, which is
14301 		 * not currently supported by the pmap layer; panic.
14302 		 */
14303 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14304 		    "flags=%u",
14305 		    __func__, native_page_size, pmap_page_size,
14306 		    flags);
14307 	}
14308 
14309 	if (PAGE_RATIO > 1) {
14310 		/*
14311 		 * The kernel is deliberately pretending to have 16KB pages.
14312 		 * The pmap layer has code that supports this, so pretend the
14313 		 * page size is larger than it is.
14314 		 */
14315 		pmap_page_size = PAGE_SIZE;
14316 		native_page_size = PAGE_SIZE;
14317 	}
14318 
14319 	/*
14320 	 * Get two pages from the VM; one to be mapped wired, and one to be
14321 	 * mapped nonwired.
14322 	 */
14323 	vm_page_t unwired_vm_page = vm_page_grab();
14324 	vm_page_t wired_vm_page = vm_page_grab();
14325 
14326 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14327 		panic("Failed to grab VM pages");
14328 	}
14329 
14330 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14331 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14332 
14333 	pmap_paddr_t pa = ptoa(pn);
14334 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14335 
14336 	/*
14337 	 * We'll start mappings at the second twig TT.  This keeps us from only
14338 	 * using the first entry in each TT, which would trivially be address
14339 	 * 0; one of the things we will need to test is retrieving the VA for
14340 	 * a given PTE.
14341 	 */
14342 	vm_map_address_t va_base = pmap_twig_size;
14343 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14344 
14345 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14346 		/*
14347 		 * Not exactly a functional failure, but this test relies on
14348 		 * there being a spare PTE slot we can use to pin the TT.
14349 		 */
14350 		panic("Cannot pin translation table");
14351 	}
14352 
14353 	/*
14354 	 * Create the wired mapping; this will prevent the pmap layer from
14355 	 * reclaiming our test TTs, which would interfere with this test
14356 	 * ("interfere" -> "make it panic").
14357 	 */
14358 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14359 
14360 #if XNU_MONITOR
14361 	/*
14362 	 * If the PPL is enabled, make sure that the kernel cannot write
14363 	 * to PPL memory.
14364 	 */
14365 	if (!pmap_ppl_disable) {
14366 		T_LOG("Validate that kernel cannot write to PPL memory.");
14367 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14368 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14369 	}
14370 #endif
14371 
14372 	/*
14373 	 * Create read-only mappings of the nonwired page; if the pmap does
14374 	 * not use the same page size as the kernel, create multiple mappings
14375 	 * so that the kernel page is fully mapped.
14376 	 */
14377 	for (map_count = 0; map_count < page_ratio; map_count++) {
14378 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14379 	}
14380 
14381 	/* Validate that all the PTEs have the expected PA and VA. */
14382 	for (map_count = 0; map_count < page_ratio; map_count++) {
14383 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14384 
14385 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14386 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14387 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14388 		}
14389 
14390 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14391 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14392 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14393 		}
14394 	}
14395 
14396 	T_LOG("Validate that reads to our mapping do not fault.");
14397 	pmap_test_read(pmap, va_base, false);
14398 
14399 	T_LOG("Validate that writes to our mapping fault.");
14400 	pmap_test_write(pmap, va_base, true);
14401 
14402 	T_LOG("Make the first mapping writable.");
14403 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14404 
14405 	T_LOG("Validate that writes to our mapping do not fault.");
14406 	pmap_test_write(pmap, va_base, false);
14407 
14408 
14409 	T_LOG("Make the first mapping execute-only");
14410 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14411 
14412 
14413 	T_LOG("Validate that reads to our mapping do not fault.");
14414 	pmap_test_read(pmap, va_base, false);
14415 
14416 	T_LOG("Validate that writes to our mapping fault.");
14417 	pmap_test_write(pmap, va_base, true);
14418 
14419 
14420 	/*
14421 	 * For page ratios of greater than 1: validate that writes to the other
14422 	 * mappings still fault.  Remove the mappings afterwards (we're done
14423 	 * with page ratio testing).
14424 	 */
14425 	for (map_count = 1; map_count < page_ratio; map_count++) {
14426 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14427 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14428 	}
14429 
14430 	T_LOG("Mark the page unreferenced and unmodified.");
14431 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14432 	pmap_test_check_refmod(pa, 0);
14433 
14434 	/*
14435 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14436 	 * different protection/fault_type settings, and confirm that the
14437 	 * ref/mod state matches our expectations at each step.
14438 	 */
14439 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14440 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14441 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14442 
14443 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14444 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14445 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14446 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14447 
14448 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14449 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14450 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14451 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14452 
14453 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14454 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14455 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14456 
14457 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14458 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14459 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14460 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14461 
14462 	/*
14463 	 * Shared memory testing; we'll have two mappings; one read-only,
14464 	 * one read-write.
14465 	 */
14466 	vm_map_address_t rw_base = va_base;
14467 	vm_map_address_t ro_base = va_base + pmap_page_size;
14468 
14469 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14470 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14471 
14472 	/*
14473 	 * Test that we take faults as expected for unreferenced/unmodified
14474 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14475 	 * mapping permissions change as expected.
14476 	 */
14477 	T_LOG("!ref/!mod: expect no access");
14478 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14479 	pmap_test_read_write(pmap, ro_base, false, false);
14480 	pmap_test_read_write(pmap, rw_base, false, false);
14481 
14482 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14483 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14484 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14485 	pmap_test_read_write(pmap, ro_base, true, false);
14486 	pmap_test_read_write(pmap, rw_base, true, false);
14487 
14488 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14489 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14490 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14491 	pmap_test_read_write(pmap, ro_base, true, false);
14492 	pmap_test_read_write(pmap, rw_base, true, true);
14493 
14494 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14495 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14496 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14497 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14498 	pmap_test_read_write(pmap, ro_base, true, false);
14499 	pmap_test_read_write(pmap, rw_base, true, true);
14500 
14501 	T_LOG("RW protect both mappings; should not change protections.");
14502 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14503 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14504 	pmap_test_read_write(pmap, ro_base, true, false);
14505 	pmap_test_read_write(pmap, rw_base, true, true);
14506 
14507 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14508 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14509 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14510 	pmap_test_read_write(pmap, ro_base, true, false);
14511 	pmap_test_read_write(pmap, rw_base, true, false);
14512 
14513 	T_LOG("RW protect the page; mappings should not change protections.");
14514 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14515 	pmap_page_protect(pn, VM_PROT_ALL);
14516 	pmap_test_read_write(pmap, ro_base, true, false);
14517 	pmap_test_read_write(pmap, rw_base, true, true);
14518 
14519 	T_LOG("Read protect the page; RW mapping should become RO.");
14520 	pmap_page_protect(pn, VM_PROT_READ);
14521 	pmap_test_read_write(pmap, ro_base, true, false);
14522 	pmap_test_read_write(pmap, rw_base, true, false);
14523 
14524 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14525 	pmap_disconnect(pn);
14526 	if (!pmap_verify_free(pn)) {
14527 		T_FAIL("Page still has mappings");
14528 	}
14529 
14530 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14531 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14532 	pmap_destroy(pmap);
14533 
14534 	T_LOG("Release the pages back to the VM.");
14535 	vm_page_lock_queues();
14536 	vm_page_free(unwired_vm_page);
14537 	vm_page_free(wired_vm_page);
14538 	vm_page_unlock_queues();
14539 
14540 	T_LOG("Testing successful!");
14541 	return 0;
14542 }
14543 #endif /* __arm64__ */
14544 
14545 kern_return_t
14546 pmap_test(void)
14547 {
14548 	T_LOG("Starting pmap_tests");
14549 #ifdef __arm64__
14550 	int flags = 0;
14551 	flags |= PMAP_CREATE_64BIT;
14552 
14553 #if __ARM_MIXED_PAGE_SIZE__
14554 	T_LOG("Testing VM_PAGE_SIZE_4KB");
14555 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14556 	T_LOG("Testing VM_PAGE_SIZE_16KB");
14557 	pmap_test_test_config(flags);
14558 #else /* __ARM_MIXED_PAGE_SIZE__ */
14559 	pmap_test_test_config(flags);
14560 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14561 
14562 #endif /* __arm64__ */
14563 	T_PASS("completed pmap_test successfully");
14564 	return KERN_SUCCESS;
14565 }
14566 #endif /* CONFIG_XNUPOST */
14567 
14568 /*
14569  * The following function should never make it to RELEASE code, since
14570  * it provides a way to get the PPL to modify text pages.
14571  */
14572 #if DEVELOPMENT || DEBUG
14573 
14574 #define ARM_UNDEFINED_INSN 0xe7f000f0
14575 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14576 
14577 /**
14578  * Forcibly overwrite executable text with an illegal instruction.
14579  *
14580  * @note Only used for xnu unit testing.
14581  *
14582  * @param pa The physical address to corrupt.
14583  *
14584  * @return KERN_SUCCESS on success.
14585  */
14586 kern_return_t
14587 pmap_test_text_corruption(pmap_paddr_t pa)
14588 {
14589 #if XNU_MONITOR
14590 	return pmap_test_text_corruption_ppl(pa);
14591 #else /* XNU_MONITOR */
14592 	return pmap_test_text_corruption_internal(pa);
14593 #endif /* XNU_MONITOR */
14594 }
14595 
/**
 * Implementation of pmap_test_text_corruption().
 *
 * Writes a permanently-undefined instruction encoding through the kernel's
 * physical-aperture alias of @c pa, temporarily lifting the read-only
 * protection on the aperture mapping when the page is marked executable.
 *
 * @param pa The physical address to corrupt; must be a managed page
 *           (asserted via pa_valid()).
 *
 * @return KERN_SUCCESS on success.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);   /* kernel virtual alias of pa */
	unsigned int pai = pa_index(pa); /* PV head table index for pa */

	assert(pa_valid(pa));

	/* Hold the PV head lock so the page's mapping state stays stable
	 * across the AP twiddle and the store below. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Pages flagged executable have a read-only physical-aperture
	 * mapping; temporarily switch it to kernel-writable (AP_RWNA) so
	 * the store below does not fault.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the aperture mapping to read-only (AP_RONA). */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Invalidate the icache so the stale instruction is not fetched;
	 * sizeof(uint32_t) is wide enough for both the ARM and THUMB cases. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14638 
14639 #endif /* DEVELOPMENT || DEBUG */
14640