xref: /xnu-10063.121.3/osfmk/arm/pmap/pmap.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2011-2021, 2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Dispatch table of page-table management callbacks for native page tables.
 * Callers reach these through pmap_get_pt_ops(), allowing alternate page
 * table formats to supply their own implementations.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                  /* allocate an address-space ID for a pmap */
	.free_id = free_asid,                    /* release a pmap's address-space ID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,              /* translate WIMG attributes into PTE bits */
};
140 
/*
 * Per-level geometry and entry-format constants for 16K-granule page tables,
 * indexed by translation level (L0..L3).  L0-L2 use the TTE valid/type/block
 * encodings; the leaf level (L3) uses the PTE valid/type masks instead.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not TTEs. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level geometry and entry-format constants for 4K-granule page tables,
 * indexed by translation level (L0..L3).  Parallel in structure to
 * pmap_table_level_info_16k; only the granule-dependent constants differ.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not TTEs. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/*
 * Page-table attribute descriptor for 4K-granule address spaces: level
 * geometry, AP/XN PTE bit encodings, page-size constants and (on mixed
 * page-size kernels) the TCR value to use with this table format.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9, /* root level derived from boot T0SZ */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),           /* read-only, kernel and user */
	.ap_rw = ARM_PTE_AP(AP_RWRW),           /* read-write, kernel and user */
	.ap_rona = ARM_PTE_AP(AP_RONA),         /* read-only, kernel only */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),         /* read-write, kernel only */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,      /* execute-never for EL1 and EL0 */
	.ap_x = ARM_PTE_PNX,                    /* user-executable (kernel XN only) */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
248 
/*
 * Page-table attribute descriptor for 16K-granule address spaces; mirrors
 * pmap_pt_attr_4k with 16K geometry and a fixed L1 root level.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),           /* read-only, kernel and user */
	.ap_rw = ARM_PTE_AP(AP_RWRW),           /* read-write, kernel and user */
	.ap_rona = ARM_PTE_AP(AP_RONA),         /* read-only, kernel only */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),         /* read-write, kernel only */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,      /* execute-never for EL1 and EL0 */
	.ap_x = ARM_PTE_PNX,                    /* user-executable (kernel XN only) */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
268 
269 #if __ARM_16K_PG__
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
271 #else /* !__ARM_16K_PG__ */
272 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
273 #endif /* !__ARM_16K_PG__ */
274 
275 
276 #if MACH_ASSERT
277 int vm_footprint_suspend_allowed = 1;
278 
279 extern int pmap_ledgers_panic;
280 extern int pmap_ledgers_panic_leeway;
281 
282 #endif /* MACH_ASSERT */
283 
284 #if DEVELOPMENT || DEBUG
285 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
286 	(current_thread()->pmap_footprint_suspended)
287 #else /* DEVELOPMENT || DEBUG */
288 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
289 #endif /* DEVELOPMENT || DEBUG */
290 
291 
292 /*
293  * Represents a tlb range that will be flushed before exiting
294  * the ppl.
295  * Used by phys_attribute_clear_range to defer flushing pages in
296  * this range until the end of the operation.
297  */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;               /* pmap whose mappings are being flushed */
	vm_map_address_t ptfr_start;    /* inclusive start VA of the deferred flush */
	vm_map_address_t ptfr_end;      /* end VA of the deferred flush */
	bool ptfr_flush_needed;         /* set when at least one PTE change requires a TLBI */
} pmap_tlb_flush_range_t;
304 
305 #if XNU_MONITOR
306 /*
307  * PPL External References.
308  */
309 extern vm_offset_t   segPPLDATAB;
310 extern unsigned long segSizePPLDATA;
311 extern vm_offset_t   segPPLTEXTB;
312 extern unsigned long segSizePPLTEXT;
313 extern vm_offset_t   segPPLDATACONSTB;
314 extern unsigned long segSizePPLDATACONST;
315 
316 
317 /*
318  * PPL Global Variables
319  */
320 
321 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
322 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
323 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
324 #else
325 const boolean_t pmap_ppl_disable = FALSE;
326 #endif
327 
328 /*
329  * Indicates if the PPL has started applying APRR.
330  * This variable is accessed from various assembly trampolines, so be sure to change
331  * those if you change the size or layout of this variable.
332  */
333 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
334 
335 extern void *pmap_stacks_start;
336 extern void *pmap_stacks_end;
337 
#endif /* XNU_MONITOR */
339 
340 
341 
342 /* Virtual memory region for early allocation */
343 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
344 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
345 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
346 
347 extern uint8_t bootstrap_pagetables[];
348 
349 extern unsigned int not_in_kdp;
350 
351 extern vm_offset_t first_avail;
352 
353 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
354 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
355 extern vm_offset_t     static_memory_end;
356 
357 extern const vm_map_address_t physmap_base;
358 extern const vm_map_address_t physmap_end;
359 
360 extern int maxproc, hard_maxproc;
361 
362 /* The number of address bits one TTBR can cover. */
363 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
364 
365 /*
366  * The bounds on our TTBRs.  These are for sanity checking that
367  * an address is accessible by a TTBR before we attempt to map it.
368  */
369 
370 /* The level of the root of a page table. */
371 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
372 
373 /* The number of entries in the root TT of a page table. */
374 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
375 
376 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
377 const pmap_t    kernel_pmap = &kernel_pmap_store;
378 
379 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
380 
381 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
382 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
383 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
384 
385 typedef struct tt_free_entry {
386 	struct tt_free_entry    *next;
387 } tt_free_entry_t;
388 
389 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
390 
391 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
392 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
393 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
394 #define FREE_PAGE_SIZE_TT_MAX   4
395 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
396 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
397 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
398 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
399 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
400 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
401 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
402 
403 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
404 
405 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
406 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
408 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
409 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
410 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
411 
412 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
413 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
414 
415 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
416 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
417 
418 /* Lock group used for all pmap object locks. */
419 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
420 
421 #if DEVELOPMENT || DEBUG
422 int nx_enabled = 1;                                     /* enable no-execute protection */
423 int allow_data_exec  = 0;                               /* No apps may execute data */
424 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
425 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
426 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
427 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
428 #else /* DEVELOPMENT || DEBUG */
429 const int nx_enabled = 1;                                       /* enable no-execute protection */
430 const int allow_data_exec  = 0;                         /* No apps may execute data */
431 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
432 #endif /* DEVELOPMENT || DEBUG */
433 
434 /**
435  * This variable is set true during hibernation entry to protect pmap data structures
436  * during image copying, and reset false on hibernation exit.
437  */
438 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
439 
440 #if MACH_ASSERT
441 static void pmap_check_ledgers(pmap_t pmap);
442 #else
/*
 * No-op stand-in for the MACH_ASSERT ledger-validation routine, compiled
 * when ledger checking is disabled so callers need no conditionals.
 */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
447 #endif /* MACH_ASSERT */
448 
449 /**
450  * This helper function ensures that potentially-long-running batched PPL operations are
451  * called in preemptible context before entering the PPL, so that the PPL call may
452  * periodically exit to allow pending urgent ASTs to be taken.
453  */
454 static inline void
pmap_verify_preemptible(void)455 pmap_verify_preemptible(void)
456 {
457 	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
458 }
459 
460 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
461 
462 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
463 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
464 
465 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
466 
467 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
468 #if defined(__arm64__)
469 /* end of shared region + 512MB for various purposes */
470 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
471 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
472     "Minimum address space size outside allowable range");
473 
474 // Max offset is 15.375GB for devices with "large" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
476 // Max offset is 11.375GB for devices with "small" memory config
477 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
478 
479 
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
481     "Large device address space size outside allowable range");
482 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
483     "Small device address space size outside allowable range");
484 
485 #  ifdef XNU_TARGET_OS_OSX
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
487 #  else
488 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
489 #  endif
490 #endif /* __arm64__ */
491 
492 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
494 #else
495 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
496 #endif
497 
498 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
499 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
500 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
501 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
502 #if !HAS_16BIT_ASID
503 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
504 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
505 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
506 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
507 #else
508 static uint16_t last_allocated_asid = 0;
509 #endif /* !HAS_16BIT_ASID */
510 
511 
512 #if __ARM_MIXED_PAGE_SIZE__
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
514 #endif
515 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
516 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
517 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
518 
519 /* PTE Define Macros */
520 
/*
 * Evaluates true when the (invalid) PTE value 'x' carries the software
 * "compressed" marker; panics if unexpected bits accompany the marker.
 * 'p' is the PTE's address, used only in the diagnostic message.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True if the software "wired" marker is set in the PTE. */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True if the software "was writeable" marker is set in the PTE. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" marker in the PTE. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
542 
543 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)544 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
545 {
546 	if (wired) {
547 		*ptep |= ARM_PTE_WIRED;
548 	} else {
549 		*ptep &= ~ARM_PTE_WIRED;
550 	}
551 	/*
552 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
553 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
554 	 * never reclaimed.
555 	 */
556 	if (pmap == kernel_pmap) {
557 		return;
558 	}
559 	unsigned short *ptd_wiredcnt_ptr;
560 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
561 	if (wired) {
562 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 	} else {
564 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
565 		if (__improbable(prev_wired == 0)) {
566 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
567 		}
568 	}
569 }
570 
571 #if HAS_FEAT_XS
572 
573 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)574 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
575 {
576 	if (__improbable(pt_attr->stage2)) {
577 		return false;
578 	}
579 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
580 	case CACHE_ATTRINDX_POSTED_XS:
581 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
582 		return true;
583 	default:
584 		return false;
585 	}
586 }
587 
588 #endif /* HAS_FEAT_XS */
589 
/*
 * Queue an asynchronous TLB invalidate for the VA range [s, e) of 'pmap' and
 * then synchronize it via arm64_sync_tlb().  'strong' and 'last_level_only'
 * are forwarded to the pmap's flush callback; 'strong' also selects the
 * synchronization flavor.
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
	arm64_sync_tlb(strong);                                                                               \
}
594 
595 /*
596  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
597  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
598  * will observe the updated PTE.
599  */
600 #define FLUSH_PTE()                                                                     \
601 	__builtin_arm_dmb(DMB_ISH);
602 
603 /*
604  * Synchronize updates to PTEs that were previously valid and thus may be cached in
605  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
606  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
607  * program order will not issue until the DSB completes.  Prior loads may be reordered
608  * after the barrier, but their behavior should not be materially affected by the
609  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
610  * matter for loads until the access is re-driven well after the TLB update is
611  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
612  * we should be in a position to handle access faults.  For "voluntary" PTE access
613  * restriction due to unmapping or protection, the decision to restrict access should
614  * have a data dependency on prior loads in order to avoid a data race.
615  */
616 #define FLUSH_PTE_STRONG()                                                             \
617 	__builtin_arm_dsb(DSB_ISHST);
618 
619 /**
620  * Write enough page table entries to map a single VM page. On systems where the
621  * VM page size does not match the hardware page size, multiple page table
622  * entries will need to be written.
623  *
624  * @note This function does not emit a barrier to ensure these page table writes
625  *       have completed before continuing. This is commonly needed. In the case
626  *       where a DMB or DSB barrier is needed, then use the write_pte() and
627  *       write_pte_strong() functions respectively instead of this one.
628  *
629  * @param ptep Pointer to the first page table entry to update.
630  * @param pte The value to write into each page table entry. In the case that
631  *            multiple PTEs are updated to a non-empty value, then the address
632  *            in this value will automatically be incremented for each PTE
633  *            write.
634  */
635 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)636 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
637 {
638 	/**
639 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
640 	 * systems, which is why it's checked at runtime instead of compile time.
641 	 * The "unreachable" warning needs to be suppressed because it still is a
642 	 * compile time constant on some systems.
643 	 */
644 	__unreachable_ok_push
645 	if (TEST_PAGE_RATIO_4) {
646 		if (((uintptr_t)ptep) & 0x1f) {
647 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
648 			    __func__, ptep, (void*)pte);
649 		}
650 
651 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
652 			/**
653 			 * If we're writing an empty/compressed PTE value, then don't
654 			 * auto-increment the address for each PTE write.
655 			 */
656 			*ptep = pte;
657 			*(ptep + 1) = pte;
658 			*(ptep + 2) = pte;
659 			*(ptep + 3) = pte;
660 		} else {
661 			*ptep = pte;
662 			*(ptep + 1) = pte | 0x1000;
663 			*(ptep + 2) = pte | 0x2000;
664 			*(ptep + 3) = pte | 0x3000;
665 		}
666 	} else {
667 		*ptep = pte;
668 	}
669 	__unreachable_ok_pop
670 }
671 
672 /**
673  * Writes enough page table entries to map a single VM page and then ensures
674  * those writes complete by executing a Data Memory Barrier.
675  *
676  * @note The DMB issued by this function is not strong enough to protect against
677  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
678  *       instruction is going to immediately be called after this write, it's
679  *       recommended to call write_pte_strong() instead of this function.
680  *
681  * See the function header for write_pte_fast() for more details on the
682  * parameters.
683  */
684 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)685 write_pte(pt_entry_t *ptep, pt_entry_t pte)
686 {
687 	write_pte_fast(ptep, pte);
688 	FLUSH_PTE();
689 }
690 
691 /**
692  * Writes enough page table entries to map a single VM page and then ensures
693  * those writes complete by executing a Data Synchronization Barrier. This
694  * barrier provides stronger guarantees than the DMB executed by write_pte().
695  *
696  * @note This function is useful if you're going to immediately flush the TLB
697  *       after making the PTE write. A DSB is required to protect against the
698  *       TLB invalidate being reordered before the PTE write.
699  *
700  * See the function header for write_pte_fast() for more details on the
701  * parameters.
702  */
703 static void
write_pte_strong(pt_entry_t * ptep,pt_entry_t pte)704 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
705 {
706 	write_pte_fast(ptep, pte);
707 	FLUSH_PTE_STRONG();
708 }
709 
710 /**
711  * Retrieve the pmap structure for the thread running on the current CPU.
712  */
713 pmap_t
current_pmap()714 current_pmap()
715 {
716 	const pmap_t current = vm_map_pmap(current_thread()->map);
717 
718 	assert(current != NULL);
719 
720 #if XNU_MONITOR
721 	/**
722 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
723 	 * decided by kernel-writable memory. This function is used in various parts
724 	 * of the PPL, and besides validating that the pointer returned by this
725 	 * function is indeed a pmap structure, it's also important to ensure that
726 	 * it's actually the current thread's pmap. This is because different pmaps
727 	 * will have access to different entitlements based on the code signature of
728 	 * their loaded process. So if a different user pmap is set in the current
729 	 * thread structure (in an effort to bypass code signing restrictions), even
730 	 * though the structure would validate correctly as it is a real pmap
731 	 * structure, it should fail here.
732 	 *
733 	 * This only needs to occur for user pmaps because the kernel pmap's root
734 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
735 	 * changed so it'd be redundant to check), and its code signing fields are
736 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
737 	 * it shouldn't be possible to set those fields. Due to that, an attacker
738 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
739 	 * this check won't accomplish anything as it doesn't provide any extra code
740 	 * signing entitlements.
741 	 */
742 	if ((current != kernel_pmap) &&
743 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
744 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
745 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
746 	}
747 #endif /* XNU_MONITOR */
748 
749 	return current;
750 }
751 
752 #if DEVELOPMENT || DEBUG
753 
754 /*
755  * Trace levels are controlled by a bitmask in which each
756  * level can be enabled/disabled by the (1<<level) position
757  * in the boot arg
758  * Level 0: PPL extension functionality
759  * Level 1: pmap lifecycle (create/destroy/switch)
760  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
761  * Level 3: internal state management (attributes/fast-fault)
762  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
763  */
764 
765 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
766 
767 #define PMAP_TRACE(level, ...) \
768 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
769 	        KDBG_RELEASE(__VA_ARGS__); \
770 	}
771 #else /* DEVELOPMENT || DEBUG */
772 
773 #define PMAP_TRACE(level, ...)
774 
775 #endif /* DEVELOPMENT || DEBUG */
776 
777 
778 /*
779  * Internal function prototypes (forward declarations).
780  */
781 
782 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
783 
784 static void pmap_set_reference(ppnum_t pn);
785 
786 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
787 
788 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
789 
790 static kern_return_t pmap_expand(
791 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
792 
793 static int pmap_remove_range(
794 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
795 
796 static tt_entry_t *pmap_tt1_allocate(
797 	pmap_t, vm_size_t, unsigned int);
798 
799 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
800 
801 static void pmap_tt1_deallocate(
802 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
803 
804 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
805 
806 static kern_return_t pmap_tt_allocate(
807 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
808 
809 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
810 
811 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
812 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
813 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
814 
815 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
816 
817 
818 static void pmap_unmap_commpage(
819 	pmap_t pmap);
820 
821 static boolean_t
822 pmap_is_64bit(pmap_t);
823 
824 
825 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
826 
827 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
828 
829 static bool pmap_update_cache_attributes_locked(
830 	ppnum_t, unsigned, bool);
831 
832 static boolean_t arm_clear_fast_fault(
833 	ppnum_t ppnum,
834 	vm_prot_t fault_type,
835 	pt_entry_t *pte_p);
836 
837 static void pmap_trim_self(pmap_t pmap);
838 static void pmap_trim_subord(pmap_t subord);
839 
840 
841 /*
842  * Temporary prototypes, while we wait for pmap_enter to move to taking an
843  * address instead of a page number.
844  */
845 static kern_return_t
846 pmap_enter_addr(
847 	pmap_t pmap,
848 	vm_map_address_t v,
849 	pmap_paddr_t pa,
850 	vm_prot_t prot,
851 	vm_prot_t fault_type,
852 	unsigned int flags,
853 	boolean_t wired);
854 
855 kern_return_t
856 pmap_enter_options_addr(
857 	pmap_t pmap,
858 	vm_map_address_t v,
859 	pmap_paddr_t pa,
860 	vm_prot_t prot,
861 	vm_prot_t fault_type,
862 	unsigned int flags,
863 	boolean_t wired,
864 	unsigned int options,
865 	__unused void   *arg,
866 	__unused pmap_mapping_type_t mapping_type);
867 
868 #ifdef CONFIG_XNUPOST
869 kern_return_t pmap_test(void);
870 #endif /* CONFIG_XNUPOST */
871 
872 PMAP_SUPPORT_PROTOTYPES(
873 	kern_return_t,
874 	arm_fast_fault, (pmap_t pmap,
875 	vm_map_address_t va,
876 	vm_prot_t fault_type,
877 	bool was_af_fault,
878 	bool from_user), ARM_FAST_FAULT_INDEX);
879 
880 PMAP_SUPPORT_PROTOTYPES(
881 	boolean_t,
882 	arm_force_fast_fault, (ppnum_t ppnum,
883 	vm_prot_t allow_mode,
884 	int options), ARM_FORCE_FAST_FAULT_INDEX);
885 
886 MARK_AS_PMAP_TEXT static boolean_t
887 arm_force_fast_fault_with_flush_range(
888 	ppnum_t ppnum,
889 	vm_prot_t allow_mode,
890 	int options,
891 	pmap_tlb_flush_range_t *flush_range);
892 
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	:0;                                 /* Zero-width member: close out the 64-bit storage unit */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4

/* The whole state must pack into a single 64-bit value. */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
912 
913 PMAP_SUPPORT_PROTOTYPES(
914 	batch_set_cache_attr_state_t,
915 	pmap_batch_set_cache_attributes, (
916 #if XNU_MONITOR
917 		volatile upl_page_info_t *user_page_list,
918 #else /* !XNU_MONITOR */
919 		upl_page_info_array_t user_page_list,
920 #endif /* XNU_MONITOR */
921 		batch_set_cache_attr_state_t state,
922 		unsigned int page_cnt,
923 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
924 
925 PMAP_SUPPORT_PROTOTYPES(
926 	kern_return_t,
927 	pmap_change_wiring, (pmap_t pmap,
928 	vm_map_address_t v,
929 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
930 
931 PMAP_SUPPORT_PROTOTYPES(
932 	pmap_t,
933 	pmap_create_options, (ledger_t ledger,
934 	vm_map_size_t size,
935 	unsigned int flags,
936 	kern_return_t * kr), PMAP_CREATE_INDEX);
937 
938 PMAP_SUPPORT_PROTOTYPES(
939 	void,
940 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
941 
942 PMAP_SUPPORT_PROTOTYPES(
943 	kern_return_t,
944 	pmap_enter_options, (pmap_t pmap,
945 	vm_map_address_t v,
946 	pmap_paddr_t pa,
947 	vm_prot_t prot,
948 	vm_prot_t fault_type,
949 	unsigned int flags,
950 	boolean_t wired,
951 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
952 
953 PMAP_SUPPORT_PROTOTYPES(
954 	pmap_paddr_t,
955 	pmap_find_pa, (pmap_t pmap,
956 	addr64_t va), PMAP_FIND_PA_INDEX);
957 
958 PMAP_SUPPORT_PROTOTYPES(
959 	kern_return_t,
960 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
961 
962 
963 PMAP_SUPPORT_PROTOTYPES(
964 	boolean_t,
965 	pmap_is_empty, (pmap_t pmap,
966 	vm_map_offset_t va_start,
967 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
968 
969 
970 PMAP_SUPPORT_PROTOTYPES(
971 	unsigned int,
972 	pmap_map_cpu_windows_copy, (ppnum_t pn,
973 	vm_prot_t prot,
974 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
975 
976 PMAP_SUPPORT_PROTOTYPES(
977 	void,
978 	pmap_ro_zone_memcpy, (zone_id_t zid,
979 	vm_offset_t va,
980 	vm_offset_t offset,
981 	const vm_offset_t new_data,
982 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
983 
984 PMAP_SUPPORT_PROTOTYPES(
985 	uint64_t,
986 	pmap_ro_zone_atomic_op, (zone_id_t zid,
987 	vm_offset_t va,
988 	vm_offset_t offset,
989 	zro_atomic_op_t op,
990 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
991 
992 PMAP_SUPPORT_PROTOTYPES(
993 	void,
994 	pmap_ro_zone_bzero, (zone_id_t zid,
995 	vm_offset_t va,
996 	vm_offset_t offset,
997 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
998 
999 PMAP_SUPPORT_PROTOTYPES(
1000 	vm_map_offset_t,
1001 	pmap_nest, (pmap_t grand,
1002 	pmap_t subord,
1003 	addr64_t vstart,
1004 	uint64_t size,
1005 	vm_map_offset_t vrestart,
1006 	kern_return_t * krp), PMAP_NEST_INDEX);
1007 
1008 PMAP_SUPPORT_PROTOTYPES(
1009 	void,
1010 	pmap_page_protect_options, (ppnum_t ppnum,
1011 	vm_prot_t prot,
1012 	unsigned int options,
1013 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1014 
1015 PMAP_SUPPORT_PROTOTYPES(
1016 	vm_map_address_t,
1017 	pmap_protect_options, (pmap_t pmap,
1018 	vm_map_address_t start,
1019 	vm_map_address_t end,
1020 	vm_prot_t prot,
1021 	unsigned int options,
1022 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1023 
1024 PMAP_SUPPORT_PROTOTYPES(
1025 	kern_return_t,
1026 	pmap_query_page_info, (pmap_t pmap,
1027 	vm_map_offset_t va,
1028 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1029 
1030 PMAP_SUPPORT_PROTOTYPES(
1031 	mach_vm_size_t,
1032 	pmap_query_resident, (pmap_t pmap,
1033 	vm_map_address_t start,
1034 	vm_map_address_t end,
1035 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1036 
1037 PMAP_SUPPORT_PROTOTYPES(
1038 	void,
1039 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1040 
1041 PMAP_SUPPORT_PROTOTYPES(
1042 	vm_map_address_t,
1043 	pmap_remove_options, (pmap_t pmap,
1044 	vm_map_address_t start,
1045 	vm_map_address_t end,
1046 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1047 
1048 
1049 PMAP_SUPPORT_PROTOTYPES(
1050 	void,
1051 	pmap_set_cache_attributes, (ppnum_t pn,
1052 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1053 
1054 PMAP_SUPPORT_PROTOTYPES(
1055 	void,
1056 	pmap_update_compressor_page, (ppnum_t pn,
1057 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1058 
1059 PMAP_SUPPORT_PROTOTYPES(
1060 	void,
1061 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1062 
1063 #if MACH_ASSERT || XNU_MONITOR
1064 PMAP_SUPPORT_PROTOTYPES(
1065 	void,
1066 	pmap_set_process, (pmap_t pmap,
1067 	int pid,
1068 	char *procname), PMAP_SET_PROCESS_INDEX);
1069 #endif
1070 
1071 PMAP_SUPPORT_PROTOTYPES(
1072 	void,
1073 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1074 
1075 PMAP_SUPPORT_PROTOTYPES(
1076 	vm_map_offset_t,
1077 	pmap_unnest_options, (pmap_t grand,
1078 	addr64_t vaddr,
1079 	uint64_t size,
1080 	vm_map_offset_t vrestart,
1081 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1082 
1083 PMAP_SUPPORT_PROTOTYPES(
1084 	void,
1085 	phys_attribute_set, (ppnum_t pn,
1086 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1087 
1088 PMAP_SUPPORT_PROTOTYPES(
1089 	void,
1090 	phys_attribute_clear, (ppnum_t pn,
1091 	unsigned int bits,
1092 	int options,
1093 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1094 
1095 #if __ARM_RANGE_TLBI__
1096 PMAP_SUPPORT_PROTOTYPES(
1097 	vm_map_address_t,
1098 	phys_attribute_clear_range, (pmap_t pmap,
1099 	vm_map_address_t start,
1100 	vm_map_address_t end,
1101 	unsigned int bits,
1102 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1103 #endif /* __ARM_RANGE_TLBI__ */
1104 
1105 
1106 PMAP_SUPPORT_PROTOTYPES(
1107 	void,
1108 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1109 
1110 PMAP_SUPPORT_PROTOTYPES(
1111 	void,
1112 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1113 
1114 PMAP_SUPPORT_PROTOTYPES(
1115 	void,
1116 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1117 
1118 PMAP_SUPPORT_PROTOTYPES(
1119 	void,
1120 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1121 
1122 PMAP_SUPPORT_PROTOTYPES(
1123 	void,
1124 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1125 
1126 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1127 PMAP_SUPPORT_PROTOTYPES(
1128 	void,
1129 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1130 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1131 
/*
 * Definition of the states used by pmap_trim().  Note that pmap_trim both
 * accepts and returns a pmap_trim_state_t (see its PMAP_SUPPORT_PROTOTYPES
 * declaration), so the operation can be driven through these states.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1152 
1153 PMAP_SUPPORT_PROTOTYPES(
1154 	pmap_trim_state_t,
1155 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1156 
1157 #if HAS_APPLE_PAC
1158 PMAP_SUPPORT_PROTOTYPES(
1159 	void *,
1160 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1161 PMAP_SUPPORT_PROTOTYPES(
1162 	void *,
1163 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1164 #endif /* HAS_APPLE_PAC */
1165 
1166 
1167 
1168 
1169 PMAP_SUPPORT_PROTOTYPES(
1170 	kern_return_t,
1171 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1172 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1173 
1174 PMAP_SUPPORT_PROTOTYPES(
1175 	kern_return_t,
1176 	pmap_load_trust_cache_with_type, (TCType_t type,
1177 	const vm_address_t pmap_img4_payload,
1178 	const vm_size_t pmap_img4_payload_len,
1179 	const vm_address_t img4_manifest,
1180 	const vm_size_t img4_manifest_len,
1181 	const vm_address_t img4_aux_manifest,
1182 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1183 
1184 PMAP_SUPPORT_PROTOTYPES(
1185 	void,
1186 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1187 
1188 PMAP_SUPPORT_PROTOTYPES(
1189 	kern_return_t,
1190 	pmap_query_trust_cache, (TCQueryType_t query_type,
1191 	const uint8_t cdhash[kTCEntryHashSize],
1192 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1193 
1194 PMAP_SUPPORT_PROTOTYPES(
1195 	errno_t,
1196 	pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1197 	const void *input_data,
1198 	size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1199 
1200 #if PMAP_CS_INCLUDE_CODE_SIGNING
1201 
1202 PMAP_SUPPORT_PROTOTYPES(
1203 	kern_return_t,
1204 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1205 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1206 
1207 PMAP_SUPPORT_PROTOTYPES(
1208 	kern_return_t,
1209 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1210 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1211 
1212 PMAP_SUPPORT_PROTOTYPES(
1213 	kern_return_t,
1214 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1215 	pmap_cs_profile_t * profile_obj),
1216 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1217 
1218 PMAP_SUPPORT_PROTOTYPES(
1219 	kern_return_t,
1220 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1221 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1222 
1223 PMAP_SUPPORT_PROTOTYPES(
1224 	kern_return_t,
1225 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1226 	const void *kernel_entitlements),
1227 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1228 
1229 PMAP_SUPPORT_PROTOTYPES(
1230 	kern_return_t,
1231 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1232 	const void **kernel_entitlements),
1233 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1234 
1235 PMAP_SUPPORT_PROTOTYPES(
1236 	kern_return_t,
1237 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1238 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1239 
1240 PMAP_SUPPORT_PROTOTYPES(
1241 	kern_return_t,
1242 	pmap_cs_allow_invalid, (pmap_t pmap),
1243 	PMAP_CS_ALLOW_INVALID_INDEX);
1244 
1245 PMAP_SUPPORT_PROTOTYPES(
1246 	void,
1247 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1248 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1249 
1250 PMAP_SUPPORT_PROTOTYPES(
1251 	bool,
1252 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1253 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1254 
1255 PMAP_SUPPORT_PROTOTYPES(
1256 	void,
1257 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1258 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1259 
1260 PMAP_SUPPORT_PROTOTYPES(
1261 	void,
1262 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1263 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1264 
1265 #endif
1266 
1267 PMAP_SUPPORT_PROTOTYPES(
1268 	uint32_t,
1269 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1270 
1271 PMAP_SUPPORT_PROTOTYPES(
1272 	bool,
1273 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1274 
1275 PMAP_SUPPORT_PROTOTYPES(
1276 	void,
1277 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1278 
1279 void pmap_footprint_suspend(vm_map_t    map,
1280     boolean_t   suspend);
1281 PMAP_SUPPORT_PROTOTYPES(
1282 	void,
1283 	pmap_footprint_suspend, (vm_map_t map,
1284 	boolean_t suspend),
1285 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1286 
1287 
1288 
1289 
1290 #if DEVELOPMENT || DEBUG
1291 PMAP_SUPPORT_PROTOTYPES(
1292 	kern_return_t,
1293 	pmap_test_text_corruption, (pmap_paddr_t),
1294 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1295 #endif /* DEVELOPMENT || DEBUG */
1296 
1297 /*
1298  * The low global vector page is mapped at a fixed alias.
 * Since the page size is 16k for H8 and newer, we map the globals to a
 * 16k-aligned address. Readers of the globals (e.g. lldb, panic server)
 * need to check both addresses anyway for backward compatibility. So for
 * now we leave H6 and H7 where they were.
1303  */
1304 #if (ARM_PGSHIFT == 14)
1305 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1306 #else
1307 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1308 #endif
1309 
1310 
1311 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1312 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1313 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1314 
#if XNU_MONITOR

#if __has_feature(ptrauth_calls)
/* Sign each table entry with the function-pointer key (address-diversified). */
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Each slot maps a *_INDEX dispatch constant (as used in the
 * PMAP_SUPPORT_PROTOTYPES() declarations above) to the corresponding
 * "_internal" implementation.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1408 
1409 #if XNU_MONITOR
1410 /**
1411  * A convenience function for setting protections on a single physical
1412  * aperture or static region mapping without invalidating the TLB.
1413  *
1414  * @note This function does not perform any TLB invalidations. That must be done
1415  *       separately to be able to safely use the updated mapping.
1416  *
1417  * @note This function understands the difference between the VM page size and
1418  *       the kernel page size and will update multiple PTEs if the sizes differ.
1419  *       In other words, enough PTEs will always get updated to change the
1420  *       permissions on a PAGE_SIZE amount of memory.
1421  *
1422  * @note The PVH lock for the physical page represented by this mapping must
1423  *       already be locked.
1424  *
1425  * @note This function assumes the caller has already verified that the PTE
1426  *       pointer does indeed point to a physical aperture or static region page
1427  *       table. Please validate your inputs before passing it along to this
1428  *       function.
1429  *
1430  * @param ptep Pointer to the physical aperture or static region page table to
1431  *             update with a new XPRR index.
1432  * @param expected_perm The XPRR index that is expected to already exist at the
1433  *                      current mapping. If the current index doesn't match this
1434  *                      then the system will panic.
1435  * @param new_perm The new XPRR index to update the mapping with.
1436  */
1437 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1438 pmap_set_pte_xprr_perm(
1439 	pt_entry_t * const ptep,
1440 	unsigned int expected_perm,
1441 	unsigned int new_perm)
1442 {
1443 	assert(ptep != NULL);
1444 
1445 	pt_entry_t spte = *ptep;
1446 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1447 
1448 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1449 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1450 		    __func__, ptep, new_perm, expected_perm);
1451 	}
1452 
1453 	/**
1454 	 * The PTE involved should be valid, should not have the hint bit set, and
1455 	 * should have the expected XPRR index.
1456 	 */
1457 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1458 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1459 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1460 		    __func__, ptep, spte, new_perm, expected_perm);
1461 	}
1462 
1463 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1464 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1465 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1466 		    __func__, ptep, spte, new_perm, expected_perm);
1467 	}
1468 
1469 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1470 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1471 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1472 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1473 	}
1474 
1475 	pt_entry_t template = spte;
1476 	template &= ~ARM_PTE_XPRR_MASK;
1477 	template |= xprr_perm_to_pte(new_perm);
1478 
1479 	write_pte_strong(ptep, template);
1480 }
1481 
1482 /**
1483  * Update the protections on a single physical aperture mapping and invalidate
1484  * the TLB so the mapping can be used.
1485  *
1486  * @note The PVH lock for the physical page must already be locked.
1487  *
1488  * @param pai The physical address index of the page whose physical aperture
1489  *            mapping will be updated with new permissions.
1490  * @param expected_perm The XPRR index that is expected to already exist at the
1491  *                      current mapping. If the current index doesn't match this
1492  *                      then the system will panic.
1493  * @param new_perm The new XPRR index to update the mapping with.
1494  */
1495 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1496 pmap_set_xprr_perm(
1497 	unsigned int pai,
1498 	unsigned int expected_perm,
1499 	unsigned int new_perm)
1500 {
1501 	pvh_assert_locked(pai);
1502 
1503 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1504 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1505 
1506 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1507 
1508 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1509 	sync_tlb_flush();
1510 }
1511 
1512 /**
1513  * Update the protections on a range of physical aperture or static region
1514  * mappings and invalidate the TLB so the mappings can be used.
1515  *
1516  * @note Static region mappings can only be updated before machine_lockdown().
1517  *       Physical aperture mappings can be updated at any time.
1518  *
1519  * @param start The starting virtual address of the static region or physical
1520  *              aperture range whose permissions will be updated.
1521  * @param end The final (inclusive) virtual address of the static region or
1522  *            physical aperture range whose permissions will be updated.
1523  * @param expected_perm The XPRR index that is expected to already exist at the
1524  *                      current mappings. If the current indices don't match
1525  *                      this then the system will panic.
1526  * @param new_perm The new XPRR index to update the mappings with.
1527  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp to the caller's end so we never walk past the requested range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type twig entries are expected here; blocks would be a bug. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			/* pmap_set_pte_xprr_perm() requires the page's PVH lock to be held. */
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* A single TLB invalidation covers the entire updated range. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1623 
1624 #endif /* XNU_MONITOR */
1625 
1626 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1627 PMAP_ZINFO_PALLOC(
1628 	pmap_t pmap, int bytes)
1629 {
1630 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1631 }
1632 
1633 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1634 PMAP_ZINFO_PFREE(
1635 	pmap_t pmap,
1636 	int bytes)
1637 {
1638 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1639 }
1640 
1641 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1642 pmap_tt_ledger_credit(
1643 	pmap_t          pmap,
1644 	vm_size_t       size)
1645 {
1646 	if (pmap != kernel_pmap) {
1647 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1648 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1649 	}
1650 }
1651 
1652 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1653 pmap_tt_ledger_debit(
1654 	pmap_t          pmap,
1655 	vm_size_t       size)
1656 {
1657 	if (pmap != kernel_pmap) {
1658 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1659 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1660 	}
1661 }
1662 
/**
 * Mark a hardware ASID as recently used in the pseudo-LRU allocator state.
 *
 * @param asid_index Index of the hardware ASID that was just handed out.
 *
 * On targets without 16-bit ASIDs, clears the ASID's availability bit in the
 * 64-entry pLRU bitmap chunk that covers it.  When that clears the last bit
 * of the chunk (every ASID in the chunk has been used), the chunk is
 * refreshed: its generation counter is bumped and its bitmap reset to
 * all-available — except the top bit of the final chunk, which corresponds
 * to an index past MAX_HW_ASIDS and stays reserved.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each bitmap word tracks 64 hardware ASIDs. */
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			/* Chunk fully consumed: advance its generation and re-arm it. */
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1676 
/**
 * Allocate a virtual ASID (VASID) for the given pmap and derive from it the
 * hardware ASID and software ASID generation stored in the pmap.
 *
 * @param pmap The pmap to receive the newly allocated ASID.
 *
 * @return true on success; false if the ASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/*
		 * Prefer the pLRU chunk with the lowest generation (least recently
		 * refreshed), then scan the VASID bitmap in strides that land on
		 * words corresponding to that hardware-ASID chunk.
		 */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			/* Candidate must be both free (asid_bitmap) and pLRU-available. */
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No free VASIDs remain; caller must fail the operation. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the VASID into a hardware ASID and a software generation. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1761 
/**
 * Return the ASID held by the given pmap to the allocator.
 *
 * Atomically claims the pmap's hw_asid (so a concurrent or repeated free
 * becomes a no-op), reverses the transformations applied by alloc_asid() to
 * recover the virtual ASID, re-arms the pLRU availability bit (on targets
 * without 16-bit ASIDs), and sets the VASID's bit back in the free bitmap.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Exchange with 0 so only one caller can observe a non-zero ASID. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;  /* Undo the EL0/EL1 ASID-pair doubling done at allocation. */
#endif
	hw_asid -= 1;   /* Undo the +1 that accounted for the reserved kernel ASID 0. */

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved VASID assigned on a pLRU overflow; see alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again in the pLRU bitmap. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1794 
1795 
1796 boolean_t
pmap_valid_address(pmap_paddr_t addr)1797 pmap_valid_address(
1798 	pmap_paddr_t addr)
1799 {
1800 	return pa_valid(addr);
1801 }
1802 
1803 
1804 
1805 
1806 
1807 
1808 /*
1809  *      Map memory at initialization.  The physical addresses being
1810  *      mapped are not managed and are never unmapped.
1811  *
1812  *      For now, VM is already on, we only need to map the
1813  *      specified memory.
1814  */
1815 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1816 pmap_map(
1817 	vm_map_address_t virt,
1818 	vm_offset_t start,
1819 	vm_offset_t end,
1820 	vm_prot_t prot,
1821 	unsigned int flags)
1822 {
1823 	kern_return_t   kr;
1824 	vm_size_t       ps;
1825 
1826 	ps = PAGE_SIZE;
1827 	while (start < end) {
1828 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1829 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1830 
1831 		if (kr != KERN_SUCCESS) {
1832 			panic("%s: failed pmap_enter, "
1833 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1834 			    __FUNCTION__,
1835 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1836 		}
1837 
1838 		virt += ps;
1839 		start += ps;
1840 	}
1841 	return virt;
1842 }
1843 
1844 #if XNU_MONITOR
1845 /**
1846  * Remove kernel writeablity from an IO PTE value if the page is owned by
1847  * guarded mode software.
1848  *
1849  * @param paddr The physical address of the page which has to be non-DRAM.
1850  * @param tmplate The PTE value to be evaluated.
1851  *
1852  * @return A new PTE value with permission bits modified.
1853  */
1854 static inline
1855 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1856 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1857 {
1858 	assert(!pa_valid(paddr));
1859 
1860 	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1861 
1862 	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1863 		/* PPL to own the page by converting KERN_RW to PPL_RW. */
1864 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1865 		switch (xprr_perm) {
1866 		case XPRR_KERN_RO_PERM:
1867 			break;
1868 		case XPRR_KERN_RW_PERM:
1869 			tmplate &= ~ARM_PTE_XPRR_MASK;
1870 			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1871 			break;
1872 		default:
1873 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1874 		}
1875 	}
1876 
1877 	return tmplate;
1878 }
1879 #endif /* XNU_MONITOR */
1880 
/**
 * Back-door mapping of the physical range [start, end) at `virt` in the
 * kernel pmap, writing PTEs directly (no PV tracking).  Memory attributes
 * are chosen from the PMAP_MAP_BD_* bits of `options`; the mapping is
 * non-executable and kernel-only, writable iff `prot` includes
 * VM_PROT_WRITE.
 *
 * @return The virtual address one past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	/* Select cacheability/ordering attributes from the mapping options. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* IO pages may need their permissions demoted to protect PPL-owned ranges. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* NOTE(review): the (unsigned) cast truncates sizes >= 4GB; presumably
	 * boot-time callers never map that much through this path — confirm. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	/* vaddr is now one page past the last mapping entered. */
	return vaddr;
}
1959 
1960 /*
1961  *      Back-door routine for mapping kernel VM at initialization.
1962  *      Useful for mapping memory outside the range
1963  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1964  *      Otherwise like pmap_map.
1965  */
1966 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1967 pmap_map_bd(
1968 	vm_map_address_t virt,
1969 	vm_offset_t start,
1970 	vm_offset_t end,
1971 	vm_prot_t prot)
1972 {
1973 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
1974 }
1975 
1976 /*
1977  *      Back-door routine for mapping kernel VM at initialization.
1978  *      Useful for mapping memory specific physical addresses in early
1979  *      boot (i.e., before kernel_map is initialized).
1980  *
1981  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1982  */
1983 
1984 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1985 pmap_map_high_window_bd(
1986 	vm_offset_t pa_start,
1987 	vm_size_t len,
1988 	vm_prot_t prot)
1989 {
1990 	pt_entry_t              *ptep, pte;
1991 	vm_map_address_t        va_start = VREGION1_START;
1992 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1993 	vm_map_address_t        va_end;
1994 	vm_map_address_t        va;
1995 	vm_size_t               offset;
1996 
1997 	offset = pa_start & PAGE_MASK;
1998 	pa_start -= offset;
1999 	len += offset;
2000 
2001 	if (len > (va_max - va_start)) {
2002 		panic("%s: area too large, "
2003 		    "pa_start=%p, len=%p, prot=0x%x",
2004 		    __FUNCTION__,
2005 		    (void*)pa_start, (void*)len, prot);
2006 	}
2007 
2008 scan:
2009 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2010 		ptep = pmap_pte(kernel_pmap, va_start);
2011 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2012 		if (*ptep == ARM_PTE_TYPE_FAULT) {
2013 			break;
2014 		}
2015 	}
2016 	if (va_start > va_max) {
2017 		panic("%s: insufficient pages, "
2018 		    "pa_start=%p, len=%p, prot=0x%x",
2019 		    __FUNCTION__,
2020 		    (void*)pa_start, (void*)len, prot);
2021 	}
2022 
2023 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2024 		ptep = pmap_pte(kernel_pmap, va_end);
2025 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2026 		if (*ptep != ARM_PTE_TYPE_FAULT) {
2027 			va_start = va_end + PAGE_SIZE;
2028 			goto scan;
2029 		}
2030 	}
2031 
2032 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2033 		ptep = pmap_pte(kernel_pmap, va);
2034 		pte = pa_to_pte(pa_start)
2035 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2036 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2037 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2038 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2039 #if __ARM_KERNEL_PROTECT__
2040 		pte |= ARM_PTE_NG;
2041 #endif /* __ARM_KERNEL_PROTECT__ */
2042 		write_pte_strong(ptep, pte);
2043 	}
2044 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2045 #if KASAN
2046 	kasan_notify_address(va_start, len);
2047 #endif
2048 	return va_start;
2049 }
2050 
/**
 * Determine the maximum number of virtual ASIDs to support, taken from the
 * "pmap-max-asids" property under /defaults in the device tree, falling back
 * to MAX_ASIDS when the property is absent.  Panics on malformed, zero, or
 * out-of-range values.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
#if HAS_16BIT_ASID
	/* With a 1:1 VASID:hardware-ASID mapping, the hardware range is the cap. */
	if (max_asids > MAX_HW_ASIDS) {
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#else
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#endif /* HAS_16BIT_ASID */
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2092 
2093 #if __arm64__
2094 /*
2095  * pmap_get_arm64_prot
2096  *
2097  * return effective armv8 VMSA block protections including
2098  * table AP/PXN/XN overrides of a pmap entry
2099  *
2100  */
2101 
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk the translation-table hierarchy from root level toward the leaf. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		/* An invalid descriptor means the address is unmapped: no permissions. */
		if (!(tte & ARM_TTE_VALID)) {
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated hierarchical (table-level) override bits. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2150 #endif /* __arm64__ */
2151 
2152 /*
2153  *	Bootstrap the system enough to run with virtual memory.
2154  *
2155  *	The early VM initialization code has already allocated
2156  *	the first CPU's translation table and made entries for
2157  *	all the one-to-one mappings to be found there.
2158  *
2159  *	We must set up the kernel pmap structures, the
2160  *	physical-to-virtual translation lookup tables for the
2161  *	physical memory to be managed (between avail_start and
2162  *	avail_end).
2163  *
2164  *	Map the kernel's code and data, and allocate the system page table.
2165  *	Page_size must already be set.
2166  *
2167  *	Parameters:
2168  *	first_avail	first available physical page -
2169  *			   after kernel page tables
2170  *	avail_start	PA of first managed physical page
2171  *	avail_end	PA of last managed physical page
2172  */
2173 
2174 void
pmap_bootstrap(vm_offset_t vstart)2175 pmap_bootstrap(
2176 	vm_offset_t vstart)
2177 {
2178 	vm_map_offset_t maxoffset;
2179 
2180 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2181 
2182 #if XNU_MONITOR
2183 
2184 #if DEVELOPMENT || DEBUG
2185 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2186 #endif
2187 
2188 #if CONFIG_CSR_FROM_DT
2189 	if (csr_unsafe_kernel_text) {
2190 		pmap_ppl_disable = true;
2191 	}
2192 #endif /* CONFIG_CSR_FROM_DT */
2193 
2194 #endif /* XNU_MONITOR */
2195 
2196 #if DEVELOPMENT || DEBUG
2197 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2198 		kprintf("Kernel traces for pmap operations enabled\n");
2199 	}
2200 #endif
2201 
2202 	/*
2203 	 *	Initialize the kernel pmap.
2204 	 */
2205 #if ARM_PARAMETERIZED_PMAP
2206 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2207 #endif /* ARM_PARAMETERIZED_PMAP */
2208 #if HAS_APPLE_PAC
2209 	kernel_pmap->disable_jop = 0;
2210 #endif /* HAS_APPLE_PAC */
2211 	kernel_pmap->tte = cpu_tte;
2212 	kernel_pmap->ttep = cpu_ttep;
2213 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2214 	kernel_pmap->max = UINTPTR_MAX;
2215 	os_atomic_init(&kernel_pmap->ref_count, 1);
2216 #if XNU_MONITOR
2217 	os_atomic_init(&kernel_pmap->nested_count, 0);
2218 #endif
2219 	kernel_pmap->nx_enabled = TRUE;
2220 #ifdef  __arm64__
2221 	kernel_pmap->is_64bit = TRUE;
2222 #else
2223 	kernel_pmap->is_64bit = FALSE;
2224 #endif
2225 #if CONFIG_ROSETTA
2226 	kernel_pmap->is_rosetta = FALSE;
2227 #endif
2228 
2229 #if ARM_PARAMETERIZED_PMAP
2230 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2231 #endif /* ARM_PARAMETERIZED_PMAP */
2232 
2233 	kernel_pmap->nested_region_addr = 0x0ULL;
2234 	kernel_pmap->nested_region_size = 0x0ULL;
2235 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2236 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2237 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2238 
2239 	kernel_pmap->hw_asid = 0;
2240 	kernel_pmap->sw_asid = 0;
2241 
2242 	pmap_lock_init(kernel_pmap);
2243 
2244 	pmap_max_asids = pmap_compute_max_asids();
2245 #if HAS_16BIT_ASID
2246 	asid_chunk_size = MAX_HW_ASIDS;
2247 #else
2248 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2249 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2250 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2251 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2252 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2253 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2254 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2255 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2256 #endif /* HAS_16BIT_ASIDS */
2257 
2258 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2259 
2260 	/**
2261 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2262 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2263 	 * space for these data structures.
2264 	 */
2265 	pmap_data_bootstrap();
2266 
2267 	/**
2268 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2269 	 */
2270 	uat_bootstrap();
2271 
2272 
2273 	/**
2274 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2275 	 */
2276 	sart_bootstrap();
2277 
2278 	/**
2279 	 * Don't make any assumptions about the alignment of avail_start before this
2280 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2281 	 */
2282 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2283 
2284 	const pmap_paddr_t pmap_struct_start = avail_start;
2285 
2286 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2287 	avail_start = round_page(avail_start + asid_table_size);
2288 
2289 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2290 
2291 	vm_first_phys = gPhysBase;
2292 	vm_last_phys = trunc_page(avail_end);
2293 
2294 	queue_init(&map_pmap_list);
2295 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2296 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2297 	free_page_size_tt_count = 0;
2298 	free_page_size_tt_max = 0;
2299 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2300 	free_two_page_size_tt_count = 0;
2301 	free_two_page_size_tt_max = 0;
2302 	free_tt_list = TT_FREE_ENTRY_NULL;
2303 	free_tt_count = 0;
2304 	free_tt_max = 0;
2305 
2306 	virtual_space_start = vstart;
2307 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2308 
2309 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2310 #if !HAS_16BIT_ASID
2311 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2312 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2313 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2314 #endif /* !HAS_16BIT_ASID */
2315 
2316 
2317 
2318 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2319 		maxoffset = trunc_page(maxoffset);
2320 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2321 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2322 			arm_pmap_max_offset_default = maxoffset;
2323 		}
2324 	}
2325 #if defined(__arm64__)
2326 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2327 		maxoffset = trunc_page(maxoffset);
2328 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2329 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2330 			arm64_pmap_max_offset_default = maxoffset;
2331 		}
2332 	}
2333 #endif
2334 
2335 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2336 
2337 
2338 #if PMAP_CS_PPL_MONITOR
2339 	/* Initialize the PPL trust cache read-write lock */
2340 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2341 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2342 #endif
2343 
2344 #if MACH_ASSERT
2345 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2346 	    &vm_footprint_suspend_allowed,
2347 	    sizeof(vm_footprint_suspend_allowed));
2348 #endif /* MACH_ASSERT */
2349 
2350 #if KASAN
2351 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2352 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2353 #endif /* KASAN */
2354 
2355 	/**
2356 	 * Ensure that avail_start is always left on a page boundary. The calling
2357 	 * code might not perform any alignment before allocating page tables so
2358 	 * this is important.
2359 	 */
2360 	avail_start = round_page(avail_start);
2361 }
2362 
2363 #if XNU_MONITOR
2364 
2365 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2366 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2367 {
2368 	pmap_paddr_t cur_pa;
2369 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2370 		assert(pa_valid(cur_pa));
2371 		ppattr_pa_set_monitor(cur_pa);
2372 	}
2373 }
2374 
2375 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2376 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2377     pmap_paddr_t end_pa,
2378     unsigned int expected_perm,
2379     unsigned int new_perm)
2380 {
2381 	vm_offset_t start_va = phystokv(start_pa);
2382 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2383 
2384 	pa_set_range_monitor(start_pa, end_pa);
2385 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2386 }
2387 
/**
 * Apply the PVH_FLAG_LOCKDOWN_KC flag to every managed page backing the
 * kernelcache so that its mappings can no longer be changed or removed.
 * Pages whose physical-aperture VA does not line up with the kernelcache VA
 * (memory that will later be reclaimed by the OS) are skipped, and on CTRR
 * test configurations the two CTRR test pages are exempted again at the end.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		/* Double-lockdown indicates a bookkeeping error somewhere. */
		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR POST tests need to modify these pages; undo their lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2433 
/**
 * Called once early-boot static allocations are complete to hand ownership
 * and protection of those regions over to the PPL: bootstrap page tables
 * become kernel-RO, remaining bootstrap allocations and PPL data become
 * PPL-RW, PPL text becomes PPL-RX, the PPL stacks' physical-aperture alias
 * becomes kernel-RO, and finally the whole kernelcache is locked down.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* PPL const data keeps its KERN_RO mapping; this marks it PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2521 
/**
 * Finalize PPL lockdown: lock down the commpage mappings (RO data, and the
 * commpage text page when present) so they can no longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* Commpage text must remain executable from EL0. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): this bare #error appears to be the unselected branch of an
	 * xPRR-configuration #if/#else whose guards were elided in this excerpt —
	 * confirm against the complete file before treating it as live code. */
#error "XPRR configuration error"
}
2538 #endif /* XNU_MONITOR */
2539 
2540 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2541 pmap_virtual_space(
2542 	vm_offset_t *startp,
2543 	vm_offset_t *endp
2544 	)
2545 {
2546 	*startp = virtual_space_start;
2547 	*endp = virtual_space_end;
2548 }
2549 
2550 
/**
 * Enumerate the kernel virtual regions the VM layer may use.
 *
 * Called repeatedly with an increasing region_select; returns TRUE and fills
 * in *startp/*size while a region exists for that index, FALSE otherwise.
 * The set of regions depends on KTRR/CTRR and ARM_LARGE_MEMORY configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 0xFFFFFFFFFE000000 masks to a 32MB (0x2000000) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 0xFFFFFFFFFF800000 masks to an 8MB (0x800000) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals reservation if gVirtBase doesn't already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2644 
2645 /*
2646  * Routines to track and allocate physical pages during early boot.
2647  * On most systems that memory runs from first_avail through to avail_end
2648  * with no gaps.
2649  *
2650  * If the system supports ECC and ecc_bad_pages_count > 0, we
2651  * need to skip those pages.
2652  */
2653 
/* Count of physical pages still available for early-boot allocation. */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() lazily computes avail_page_count. */
static bool need_ram_ranges_init = true;
2656 
2657 
2658 /**
2659  * Checks to see if a given page is in
2660  * the array of known bad pages
2661  *
2662  * @param ppn page number to check
2663  */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/* No bad-page tracking in this configuration: every page is usable. */
	return false;
}
2669 
2670 /**
2671  * Prepare bad ram pages to be skipped.
2672  */
2673 
2674 /*
2675  * Initialize the count of available pages. No lock needed here,
2676  * as this code is called while kernel boot up is single threaded.
2677  */
2678 static void
initialize_ram_ranges(void)2679 initialize_ram_ranges(void)
2680 {
2681 	pmap_paddr_t first = first_avail;
2682 	pmap_paddr_t end = avail_end;
2683 
2684 	assert(first <= end);
2685 	assert(first == (first & ~PAGE_MASK));
2686 	assert(end == (end & ~PAGE_MASK));
2687 	avail_page_count = atop(end - first);
2688 
2689 	need_ram_ranges_init = false;
2690 }
2691 
/**
 * Return the number of physical pages still available for early-boot
 * allocation, initializing the range bookkeeping on first use.
 */
unsigned int
pmap_free_pages(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return avail_page_count;
}
2701 
/**
 * Return the page count of the whole remaining span [first_avail, avail_end),
 * initializing the range bookkeeping on first use.
 */
unsigned int
pmap_free_pages_span(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return (unsigned int)atop(avail_end - first_avail);
}
2711 
2712 
/**
 * Allocate the next boot-time page "from the top"; on this platform it is
 * simply an alias for pmap_next_page() (might_free is ignored).
 */
boolean_t
pmap_next_page_hi(
	ppnum_t            * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2720 
2721 
2722 boolean_t
pmap_next_page(ppnum_t * pnum)2723 pmap_next_page(
2724 	ppnum_t *pnum)
2725 {
2726 	if (need_ram_ranges_init) {
2727 		initialize_ram_ranges();
2728 	}
2729 
2730 
2731 	if (first_avail != avail_end) {
2732 		*pnum = (ppnum_t)atop(first_avail);
2733 		first_avail += PAGE_SIZE;
2734 		assert(avail_page_count > 0);
2735 		--avail_page_count;
2736 		return TRUE;
2737 	}
2738 	assert(avail_page_count == 0);
2739 	return FALSE;
2740 }
2741 
2742 
2743 /*
2744  *	Initialize the pmap module.
2745  *	Called by vm_init, to initialize any structures that the pmap
2746  *	system needs to map virtual memory.
2747  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	/* ZC_ZFREE_CLEARMEM zeroes pmap structs on free to avoid stale data. */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	/* Each process needs its own ASID, so proc limits are clamped to the ASID pool size. */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2788 
2789 /**
2790  * Verify that a given physical page contains no mappings (outside of the
2791  * default physical aperture mapping).
2792  *
2793  * @param ppnum Physical page number to check there are no mappings to.
2794  *
2795  * @return True if there are no mappings, false otherwise or if the page is not
2796  *         kernel-managed.
2797  */
2798 bool
pmap_verify_free(ppnum_t ppnum)2799 pmap_verify_free(ppnum_t ppnum)
2800 {
2801 	const pmap_paddr_t pa = ptoa(ppnum);
2802 
2803 	assert(pa != vm_page_fictitious_addr);
2804 
2805 	/* Only mappings to kernel-managed physical memory are tracked. */
2806 	if (!pa_valid(pa)) {
2807 		return false;
2808 	}
2809 
2810 	const unsigned int pai = pa_index(pa);
2811 	pv_entry_t **pvh = pai_to_pvh(pai);
2812 
2813 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2814 }
2815 
2816 #if MACH_ASSERT
2817 /**
2818  * Verify that a given physical page contains no mappings (outside of the
2819  * default physical aperture mapping) and if it does, then panic.
2820  *
2821  * @note It's recommended to use pmap_verify_free() directly when operating in
2822  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2823  *       normally being called from outside of the PPL, and the pv_head_table
2824  *       can't be modified outside of the PPL).
2825  *
2826  * @param ppnum Physical page number to check there are no mappings to.
2827  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		/* Page is untracked or genuinely free: nothing to assert. */
		return;
	}

	/* From here on we know the page has at least one tracked mapping. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH entry is the PTE pointer itself. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		/* Multi-mapping case: walk the first PVE on the list. */
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2900 #endif
2901 
2902 
2903 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2904 pmap_root_alloc_size(pmap_t pmap)
2905 {
2906 #pragma unused(pmap)
2907 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2908 	unsigned int root_level = pt_attr_root_level(pt_attr);
2909 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2910 }
2911 
2912 
2913 /*
2914  *	Create and return a physical map.
2915  *
2916  *	If the size specified for the map
2917  *	is zero, the map is an actual physical
2918  *	map, and may be referenced by the
2919  *	hardware.
2920  *
2921  *	If the size specified is non-zero,
2922  *	the map will be used in software only, and
2923  *	is bounded by that size.
2924  */
/**
 * Allocate and initialize a new user pmap.
 *
 * @param ledger ledger to attach to the pmap (reference handling differs
 *               between PPL and non-PPL configurations).
 * @param size   must be 0 on this platform (non-zero is stage-2-only).
 * @param flags  PMAP_CREATE_* options; unknown flags are rejected.
 * @param kr     out-parameter receiving the detailed kern_return_t on failure.
 *
 * @return the new pmap, or PMAP_NULL on failure (with *kr set).
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation doesn't understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	/* PPL config: pmap structs come from the PPL's own allocator. */
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* Validate the caller-supplied ledger and take a PPL-side reference. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds yet: cover the whole address space. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve a hardware ASID for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	/* Inside the PPL we must not block; caller retries after feeding pages. */
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Ensure the cleared root table is visible before the pmap is used. */
	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition (goto-based cleanup). */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* *kr lives in kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3115 
/**
 * Public wrapper for pmap creation.
 *
 * Takes a ledger reference up front (dropped again on failure) and, in the
 * PPL configuration, retries pmap_create_options_ppl() whenever it reports
 * KERN_RESOURCE_SHORTAGE after donating a fresh page to the PPL.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		/* Feed the PPL one more page and retry the allocation. */
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	/*
	 * NOTE(review): the trace below evaluates PMAP_VASID(pmap) and
	 * pmap->hw_asid even when pmap is PMAP_NULL — confirm PMAP_TRACE /
	 * PMAP_VASID tolerate a NULL pmap on failure paths.
	 */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3151 
3152 #if XNU_MONITOR
3153 /*
3154  * This symbol remains in place when the PPL is enabled so that the dispatch
3155  * table does not change from development to release configurations.
3156  */
3157 #endif
3158 #if MACH_ASSERT || XNU_MONITOR
/**
 * Record the owning process's pid and name on a pmap (debug builds only;
 * compiled to a no-op body unless MACH_ASSERT is set).
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	/* A pmap_pid of -1 marks a pmap whose identity must not be overwritten. */
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3176 #endif /* MACH_ASSERT || XNU_MONITOR */
3177 
3178 #if MACH_ASSERT
/**
 * Public entry point for tagging a pmap with its process identity; dispatches
 * through the PPL trampoline when XNU_MONITOR is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3191 #endif /* MACH_ASSERT */
3192 
3193 /*
3194  * pmap_deallocate_all_leaf_tts:
3195  *
3196  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3197  * removing and deallocating all TTEs.
3198  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf-level tables are freed by their parent's pmap_tte_deallocate(). */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry in a table at this level (va = all-ones). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in a user pmap being destroyed. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() here — presumably
		 * pmap_tte_deallocate() drops the lock on behalf of the caller;
		 * confirm against its definition.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3237 
3238 /*
3239  * We maintain stats and ledgers so that a task's physical footprint is:
3240  * phys_footprint = ((internal - alternate_accounting)
3241  *                   + (internal_compressed - alternate_accounting_compressed)
3242  *                   + iokit_mapped
3243  *                   + purgeable_nonvolatile
3244  *                   + purgeable_nonvolatile_compressed
3245  *                   + page_table)
3246  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3247  */
3248 
3249 /*
3250  *	Retire the given physical map from service.
3251  *	Should only be called if the map contains
3252  *	no valid mappings.
3253  */
/**
 * Drop a reference on a pmap and, when the last reference is released, tear
 * the pmap down: unmap its commpage, remove it from the global list, free its
 * translation tables, release its ASID, and free the structure itself.
 *
 * @param pmap the pmap to dereference/destroy; PMAP_NULL is a no-op.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop our reference; only the last releaser performs the teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Make sure no CPU is currently using (or switching to) this pmap. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps have no ASID of their own; just flush their region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	/* Verify all ledger balances net out to zero before freeing. */
	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3366 
/**
 * Public pmap destruction entry point: dispatches to the PPL (with a
 * post-destroy ledger balance check) or to pmap_destroy_internal(), then
 * drops the kernel-side ledger reference taken at creation time.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger now: the pmap struct may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3387 
3388 
3389 /*
3390  *	Add a reference to the specified pmap.
3391  */
3392 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3393 pmap_reference_internal(
3394 	pmap_t pmap)
3395 {
3396 	if (pmap != PMAP_NULL) {
3397 		validate_pmap_mutable(pmap);
3398 		os_atomic_inc(&pmap->ref_count, relaxed);
3399 	}
3400 }
3401 
/**
 * Public wrapper for taking a pmap reference; routes through the PPL
 * trampoline when XNU_MONITOR is enabled.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3412 
/**
 * Allocate a root (TT1) translation table of the given size.
 *
 * Satisfies the request from the per-size free lists when possible;
 * otherwise allocates fresh page(s).  Sub-page allocations carve the
 * remainder of the page into free-list entries for future requests.
 *
 * @param pmap   pmap being charged for the allocation (ledger accounting)
 * @param size   requested table size; sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block
 *
 * @return kernel-virtual pointer to the table, or NULL on shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only PMAP_ROOT_ALLOC_SIZE may be sub-page; everything else rounds up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Fast path: reuse a cached table from the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate fresh zeroed page(s). */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Carve the unused remainder of the page into sub-page entries
		 * and splice the whole chain onto the free list in one lock hold.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3489 
/**
 * Return a root (TT1) translation table to the per-size free lists, then
 * (unless PMAP_TT_DEALLOCATE_NOBLOCK) trim the page-sized lists back down to
 * their高-water limits by freeing surplus pages.
 *
 * @param pmap   pmap being debited for the deallocation (ledger accounting)
 * @param tt     table to free
 * @param size   table size; sub-page sizes other than PMAP_ROOT_ALLOC_SIZE
 *               are rounded up to PAGE_SIZE (mirrors pmap_tt1_allocate)
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trimming phase
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size class. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller can't block: defer trimming to a later deallocation. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list. The lock is dropped around each
	 * pmap_pages_free() call (which may block) and re-taken to re-check
	 * the count, so the list may change between iterations.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Same trimming for the two-page free list. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3569 }
3570 
/**
 * Allocate memory for a (non-root) page table within the given pmap.
 *
 * A sub-page entry from pmap->tt_entry_free is reused when one is available.
 * Otherwise a full VM page is allocated and given a page table descriptor
 * (PTD); when the pmap's page size is smaller than the kernel PAGE_SIZE, the
 * page is carved into pmap-page-sized tables and the unused pieces are parked
 * on pmap->tt_entry_free.
 *
 * @param pmap The pmap the new page table will belong to.
 * @param ttp Output parameter: set to the KVA of the new table, NULL on failure.
 * @param level Page table level being allocated; used only for statistics.
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to fail with
 *        KERN_RESOURCE_SHORTAGE rather than wait for free pages.
 *
 * @return KERN_SUCCESS on success, KERN_ABORTED if the pmap lock could not be
 *         acquired without risking excessive preemption latency, or
 *         KERN_RESOURCE_SHORTAGE when a NOWAIT allocation cannot be satisfied.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	/* Pop the head of the per-pmap sub-page free list, if non-empty. */
	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			/*
			 * NOTE(review): the allocation above honors PMAP_TT_ALLOCATE_NOWAIT
			 * while this bail-out tests PMAP_OPTIONS_NOWAIT -- confirm callers
			 * always pass these two flags together.
			 */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Twig and higher levels count as TTE pages; leaf level as PTE pages. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Point the page's PV head at the new descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		/*
		 * If the pmap's page size is smaller than the kernel page size, carve
		 * the remainder of the VM page into pmap-page-sized tables and chain
		 * them onto the per-pmap free list for future allocations.
		 */
		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3669 
3670 
/**
 * Release a page table previously obtained from pmap_tt_allocate().
 *
 * The table must have no remaining active references: leaf tables must have a
 * zero refcnt, and non-leaf tables carry the PT_DESC_REFCOUNT sentinel (which
 * is cleared here).  The table is placed on the pmap's sub-page free list; if
 * every pmap-page-sized table within the containing VM page is then free, all
 * of those free-list entries are unlinked and the whole page is returned to
 * the VM, along with its descriptor.
 *
 * @param pmap The pmap that owns the page table being deallocated.
 * @param ttp KVA of the page table being deallocated.
 * @param level Level of the table; selects leaf/non-leaf refcnt handling and
 *        which statistics counter to decrement.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables use a sentinel refcount; clear it so the check below passes. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum active references across all pmap-page-sized tables sharing this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		/* Starts at 1 to account for ttp itself, which isn't on the list yet. */
		unsigned pt_free_entry_cnt = 1;

		/* Count how many free-list entries fall within ttp's VM page. */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every table in this VM page is free: unlink all of its entries
			 * from the free list so the page itself can be released below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Next entry lives in the doomed page: splice it out. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Page not fully free yet: just push ttp onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables in this page are still in use: push ttp onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the descriptor, clear the PV head, and free the VM page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3759 
3760 /**
3761  * Safely clear out a translation table entry.
3762  *
3763  * @note If the TTE to clear out points to a leaf table, then that leaf table
3764  *       must have a refcnt of zero before the TTE can be removed.
3765  * @note This function expects to be called with pmap locked exclusive, and will
3766  *       return with pmap unlocked.
3767  *
3768  * @param pmap The pmap containing the page table whose TTE is being removed.
3769  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3770  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3771  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3772  * @param ttep Pointer to the TTE that should be cleared out.
3773  * @param level The level of the page table that contains the TTE to be removed.
3774  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and make the clear visible before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		/* Start by assuming every CPU may be inside pmap_disconnect(). */
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Wait for the CPU's store to clear the flag (or any event). */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* This CPU was observed outside pmap_disconnect(); don't recheck it. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3949 
3950 /**
3951  * Given a pointer to an entry within a `level` page table, delete the
3952  * page table at `level` + 1 that is represented by that entry. For instance,
3953  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3954  * contains the PA of the L3 table, and `level` would be "2".
3955  *
3956  * @note If the table getting deallocated is a leaf table, then that leaf table
3957  *       must have a refcnt of zero before getting deallocated. All other levels
3958  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3959  * @note This function expects to be called with pmap locked exclusive and will
3960  *       return with pmap unlocked.
3961  *
3962  * @param pmap The pmap that owns the page table to be deallocated.
3963  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3964  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3965  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3966  * @param ttep Pointer to the `level` TTE to remove.
3967  * @param level The level of the table that contains an entry pointing to the
3968  *              table to be removed. The deallocated page table will be a
3969  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3970  *              deleted).
3971  */
3972 void
3973 pmap_tte_deallocate(
3974 	pmap_t pmap,
3975 	vm_offset_t va_start,
3976 	vm_offset_t va_end,
3977 	bool need_strong_sync,
3978 	tt_entry_t *ttep,
3979 	unsigned int level)
3980 {
3981 	tt_entry_t tte;
3982 
3983 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3984 
3985 	tte = *ttep;
3986 
3987 	if (tte_get_ptd(tte)->pmap != pmap) {
3988 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3989 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3990 	}
3991 
3992 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3993 	    __func__, ttep, (unsigned long long)tte);
3994 
3995 	/* pmap_tte_remove() will drop the pmap lock */
3996 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3997 
3998 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3999 }
4000 
4001 /*
4002  *	Remove a range of hardware page-table entries.
4003  *	The entries given are the first (inclusive)
4004  *	and last (exclusive) entries for the VM pages.
4005  *	The virtual address is the va for the first pte.
4006  *
4007  *	The pmap must be locked.
4008  *	If the pmap is not the kernel pmap, the range must lie
4009  *	entirely within one pte-page.  This is NOT checked.
4010  *	Assumes that the pte-page exists.
4011  *
 *	Returns the number of PTEs changed
4013  */
4014 MARK_AS_PMAP_TEXT static int
4015 pmap_remove_range(
4016 	pmap_t pmap,
4017 	vm_map_address_t va,
4018 	pt_entry_t *bpte,
4019 	pt_entry_t *epte)
4020 {
4021 	bool need_strong_sync = false;
4022 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4023 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4024 	if (num_changed > 0) {
4025 		PMAP_UPDATE_TLBS(pmap, va,
4026 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4027 	}
4028 	return num_changed;
4029 }
4030 
4031 
4032 #ifdef PVH_FLAG_EXEC
4033 
4034 /*
4035  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
4037  *	has no writable mappings anywhere in the system, including the physical
4038  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4039  *	synchronization overhead in cases where the call to this function is
4040  *	guaranteed to be followed by other TLB operations.
4041  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the kernel PTE mapping this page in the physical aperture. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint PTE can't be updated in isolation; treat it as fatal. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Always issue the TLB invalidate; only the final synchronization is optional. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4066 #endif /* defined(PVH_FLAG_EXEC) */
4067 
4068 
4069 
/**
 * Remove a contiguous run of hardware PTEs [bpte, epte) belonging to 'pmap',
 * where 'va' is the VA mapped by the first PTE, updating PV lists, wired
 * counts, pagetable refcounts and task ledgers as it goes.
 *
 * @note The pmap must be locked exclusive and the range must not cross a page
 *       table boundary.  The TLB is NOT flushed here; the caller performs TLB
 *       maintenance based on the return value.
 *
 * @param pmap Pmap whose mappings are being removed.
 * @param va VA mapped by the first PTE of the range.
 * @param bpte First PTE in the range (inclusive).
 * @param epte End of the PTE range (exclusive).
 * @param eva If non-NULL, pending preemption is polled on each iteration and,
 *        on early exit, *eva is set to the first VA not processed.
 * @param need_strong_sync Output: set to true when TLB maintenance for this
 *        range requires a strong DSB.
 * @param options PMAP_OPTIONS_REMOVE to also clear "compressed" markers on
 *        non-kernel pmaps.
 *
 * @return The number of PTEs actually changed.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	/* Net adjustment to the leaf table's refcnt; applied atomically after the loop. */
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Take the PVH lock for the page mapped by this PTE, re-reading the
		 * PTE after acquisition to make sure it still maps the same page.
		 * Unmanaged (non-RAM) mappings break out with no PVH lock held.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			/* XS-attribute mappings require a strong DSB for TLB maintenance. */
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* User-pmap mappings contribute to the leaf table refcnt. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the mapping for the ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcnt delta to the leaf table in one shot. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4318 
4319 
4320 /*
4321  *	Remove the given range of addresses
4322  *	from the specified map.
4323  *
4324  *	It is assumed that the start and end are properly
4325  *	rounded to the hardware page size.
4326  */
4327 void
4328 pmap_remove(
4329 	pmap_t pmap,
4330 	vm_map_address_t start,
4331 	vm_map_address_t end)
4332 {
4333 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4334 }
4335 
/*
 * Guts of pmap_remove_options() (the PPL entry point on XNU_MONITOR):
 * remove the mappings in [start, end) from the given pmap.  The range must
 * be leaf-page aligned and, per the caller's chunking, fall within a single
 * twig-level (L2) table region.
 *
 * Returns the VA up to which removal actually proceeded (pmap_remove_range_
 * options() may update `eva` to stop early); the caller resumes from the
 * returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	/* Cleared below if pmap_tte_deallocate() drops the lock on our behalf. */
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject a backwards range or one that isn't leaf-page aligned. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry means nothing was ever mapped here. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Compute the span of leaf PTEs covering [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty for a user pmap, free it. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4402 
/*
 * Remove all mappings in [start, end) from `pmap`.
 *
 * The work is chunked one twig-level (L2 table) region at a time, so each
 * pmap_remove_options_internal()/PPL call stays within a single leaf table
 * and remains bounded in duration.
 *
 * @param pmap     target address space; PMAP_NULL is a no-op.
 * @param start    starting VA (leaf-page aligned).
 * @param end      ending VA, exclusive (leaf-page aligned).
 * @param options  PMAP_OPTIONS_* flags forwarded to the removal code.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Remove the mappings, one twig-level region at a time.  TLB
	 * invalidation for modified entries happens inside each per-chunk
	 * call.
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* End of the current twig region, clamped to the caller's end. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* The call may stop early; resume from the VA it returns. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4455 
4456 
4457 /*
4458  *	Remove phys addr if mapped in specified map
4459  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentionally a no-op today: implement to support working set code. */
}
4467 
4468 /*
4469  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4470  * switch a thread onto a new vm_map.
4471  */
4472 void
4473 pmap_switch_user(thread_t thread, vm_map_t new_map)
4474 {
4475 	pmap_t new_pmap = new_map->pmap;
4476 
4477 
4478 	thread->map = new_map;
4479 	pmap_set_pmap(new_pmap, thread);
4480 
4481 }
4482 
/*
 * Activate `pmap` on the current CPU on behalf of `thread`.  On
 * __ARM_USER_PROTECT__ configurations, the TTB and ASID values are also
 * cached on the thread (presumably consulted on user-mode return — confirm
 * against the machine-dependent context-switch code).
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Cache the user translation-table base (with TTBR setup bits) and ASID. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4497 
4498 static void
4499 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4500 {
4501 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4502 }
4503 
4504 static inline bool
4505 pmap_user_ttb_is_clear(void)
4506 {
4507 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4508 }
4509 
/*
 * Per-CPU guts of pmap_switch(): make `pmap` the active user address space
 * on the calling CPU.  Performs whatever *local* TLB maintenance is needed
 * to avoid stale or aliasing translations — software-ASID aliasing, a
 * change of nested (shared-region) pmap, or a commpage page-size change —
 * and then programs the user TTB via pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* Only the kernel pmap may legitimately carry hardware ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* NOTE(review): ASIDs appear to be allocated in pairs under
	 * __ARM_KERNEL_PROTECT__, so index by pair here — confirm against the
	 * ASID allocator. */
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this phyiscal ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page-size geometry) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Point the user TTB at the invalid table before any flushes, if required. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Synchronize whatever async invalidations were issued above. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4653 
/*
 * Switch the current CPU to the given pmap's user address space, routing
 * through the PPL on XNU_MONITOR configurations.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4666 
4667 void
4668 pmap_page_protect(
4669 	ppnum_t ppnum,
4670 	vm_prot_t prot)
4671 {
4672 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4673 }
4674 
4675 /*
4676  *	Routine:	pmap_page_protect_options
4677  *
4678  *	Function:
4679  *		Lower the permission for all mappings to a given
4680  *		page.
4681  */
/*
 * Core of pmap_page_protect_options(): walk the PV list of a managed page
 * and either lower the permissions of every CPU mapping (read / read-execute
 * requests) or remove the mappings entirely (anything else).  IOMMU mappings
 * are never removed.  Runs two passes over the PV list: pass 1 updates PTEs
 * and ledgers, pass 2 issues TLB invalidations and unlinks removed entries;
 * the per-pass update counters must agree or we panic.
 *
 * @param ppnum        physical page whose mappings are updated.
 * @param prot         new, lowered protection.
 * @param options      PMAP_OPTIONS_* flags (COMPRESSOR, NOFLUSH, ...).
 * @param flush_range  if non-NULL, the caller batches TLB maintenance for
 *                     mappings of its pmap inside its VA range, and flushes
 *                     for such mappings are deferred to it.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	pv_entry_t     *pveh_p;
	pv_entry_t     *pvet_p;
	pt_entry_t     *pte_p, *orig_pte_p;
	pv_entry_t     *new_pve_p;
	pt_entry_t     *new_pte_p;
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	unsigned int    pvh_cnt = 0;
	/* Cross-checked after pass 2: both passes must touch the same mappings. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
				    __func__, pmap, ppnum);
			}

			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize the pass-1 PTE stores before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
				pmap_flush_noncoherent_page(phys);
			}
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}


	pvh_unlock(pai);

	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* Permission-lowering (non-remove) flushes may complete after the lock drop. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5202 
5203 MARK_AS_PMAP_TEXT void
5204 pmap_page_protect_options_internal(
5205 	ppnum_t ppnum,
5206 	vm_prot_t prot,
5207 	unsigned int options,
5208 	void *arg)
5209 {
5210 	if (arg != NULL) {
5211 		/*
5212 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5213 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5214 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5215 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5216 		 * In that case, force the flush to take place.
5217 		 */
5218 		options &= ~PMAP_OPTIONS_NOFLUSH;
5219 	}
5220 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5221 }
5222 
5223 void
5224 pmap_page_protect_options(
5225 	ppnum_t ppnum,
5226 	vm_prot_t prot,
5227 	unsigned int options,
5228 	void *arg)
5229 {
5230 	pmap_paddr_t    phys = ptoa(ppnum);
5231 
5232 	assert(ppnum != vm_page_fictitious_addr);
5233 
5234 	/* Only work with managed pages. */
5235 	if (!pa_valid(phys)) {
5236 		return;
5237 	}
5238 
5239 	/*
5240 	 * Determine the new protection.
5241 	 */
5242 	if (prot == VM_PROT_ALL) {
5243 		return;         /* nothing to do */
5244 	}
5245 
5246 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5247 
5248 #if XNU_MONITOR
5249 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5250 #else
5251 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5252 #endif
5253 
5254 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5255 }
5256 
5257 
5258 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5259 MARK_AS_PMAP_TEXT void
5260 pmap_disable_user_jop_internal(pmap_t pmap)
5261 {
5262 	if (pmap == kernel_pmap) {
5263 		panic("%s: called with kernel_pmap", __func__);
5264 	}
5265 	validate_pmap_mutable(pmap);
5266 	pmap->disable_jop = true;
5267 }
5268 
/*
 * Disable JOP (pointer authentication of user code) for the given user pmap
 * by setting pmap->disable_jop, dispatching into the PPL on XNU_MONITOR
 * configurations.  Panics if called on kernel_pmap (see the internal helper).
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5278 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5279 
5280 /*
5281  * Indicates if the pmap layer enforces some additional restrictions on the
5282  * given set of protections.
5283  */
5284 bool
5285 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5286 {
5287 	return false;
5288 }
5289 
5290 /*
5291  *	Set the physical protection on the
5292  *	specified range of this map as requested.
5293  *	VERY IMPORTANT: Will not increase permissions.
5294  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5295  */
5296 void
5297 pmap_protect(
5298 	pmap_t pmap,
5299 	vm_map_address_t b,
5300 	vm_map_address_t e,
5301 	vm_prot_t prot)
5302 {
5303 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5304 }
5305 
/*
 * Reduce the protection on the leaf mappings covering [start, end) in the
 * given pmap.  The range must not cross a twig (table) boundary; callers
 * such as pmap_protect_options() chop larger ranges accordingly (panics
 * otherwise).  A request that would remove all access must instead be
 * issued as a remove operation and panics here.
 *
 * @param pmap    pmap whose mappings are updated; validated before use.
 * @param start   first VA of the range.
 * @param end     VA one past the last mapping to update.
 * @param prot    new (non-increasing) protection to apply.
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_PROTECT_IMMEDIATE is
 *                only honored on DEVELOPMENT || DEBUG kernels.
 *
 * @return the VA just past the last PTE processed.  This may be less than
 *         'end' if the loop bailed out early due to pending preemption;
 *         the caller is expected to resume from the returned VA.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must fit within the twig entry containing 'start'. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			/* Execute-only: also downgrade the AP bits below. */
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* Executable requests (and DEBUG kernels with NX disabled) keep X. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/*
			 * Periodically poll for pending preemption and bail out
			 * early if needed; the caller resumes from the returned
			 * VA ('va' has not yet been advanced past this PTE).
			 */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Skip empty and compressed (swapped-out) entries. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock and re-read the PTE
			 * until we observe a stable PA under that lock.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have been cleared/compressed above. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the downgraded AP (access permission) bits. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Synchronize the PTE stores, then invalidate the updated VAs. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table for this range: nothing mapped, report done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5553 
/*
 * Set the protection on the page-aligned VA range [b, e) of the given pmap.
 * Never increases permissions.  A request that strips all access is
 * forwarded to pmap_remove_options().  The range is processed one twig
 * (table) entry at a time via pmap_protect_options_internal() (or its PPL
 * counterpart), which may return early for preemption; the loop resumes
 * from the returned VA.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's leaf page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Removing all access is a remove, not a protect. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* Clamp each chunk to the end of the current twig entry. */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5628 
5629 /**
5630  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5631  *
5632  * @param pmap pmap to insert the pages into.
5633  * @param va virtual address to map the pages into.
5634  * @param pa page number of the first physical page to map.
5635  * @param size block size, in number of pages.
5636  * @param prot mapping protection attributes.
5637  * @param attr flags to pass to pmap_enter().
5638  *
5639  * @return KERN_SUCCESS.
5640  */
5641 kern_return_t
5642 pmap_map_block(
5643 	pmap_t pmap,
5644 	addr64_t va,
5645 	ppnum_t pa,
5646 	uint32_t size,
5647 	vm_prot_t prot,
5648 	int attr,
5649 	unsigned int flags)
5650 {
5651 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5652 }
5653 
5654 /**
5655  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5656  * As opposed to pmap_map_block(), this function takes
5657  * a physical address as an input and operates using the
5658  * page size associated with the input pmap.
5659  *
5660  * @param pmap pmap to insert the pages into.
5661  * @param va virtual address to map the pages into.
5662  * @param pa physical address of the first physical page to map.
5663  * @param size block size, in number of pages.
5664  * @param prot mapping protection attributes.
5665  * @param attr flags to pass to pmap_enter().
5666  *
5667  * @return KERN_SUCCESS.
5668  */
5669 kern_return_t
5670 pmap_map_block_addr(
5671 	pmap_t pmap,
5672 	addr64_t va,
5673 	pmap_paddr_t pa,
5674 	uint32_t size,
5675 	vm_prot_t prot,
5676 	int attr,
5677 	unsigned int flags)
5678 {
5679 #if __ARM_MIXED_PAGE_SIZE__
5680 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5681 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5682 #else
5683 	const uint64_t pmap_page_size = PAGE_SIZE;
5684 #endif
5685 
5686 	for (ppnum_t page = 0; page < size; page++) {
5687 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5688 			panic("%s: failed pmap_enter_addr, "
5689 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5690 			    __FUNCTION__,
5691 			    pmap, va, (uint64_t)pa, size, prot, flags);
5692 		}
5693 
5694 		va += pmap_page_size;
5695 		pa += pmap_page_size;
5696 	}
5697 
5698 	return KERN_SUCCESS;
5699 }
5700 
/*
 * Enter a mapping for physical address 'pa' at virtual address 'v' in the
 * given pmap.  Thin wrapper around pmap_enter_options_addr() with no
 * options, no argument block, and an inferred mapping type.
 */
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5713 
5714 /*
5715  *	Insert the given physical page (p) at
5716  *	the specified virtual address (v) in the
5717  *	target physical map with the protection requested.
5718  *
5719  *	If specified, the page will be wired down, meaning
5720  *	that the related pte can not be reclaimed.
5721  *
5722  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5723  *	or lose information.  That is, this routine must actually
5724  *	insert this page into the given map eventually (must make
5725  *	forward progress eventually.
5726  */
5727 kern_return_t
5728 pmap_enter(
5729 	pmap_t pmap,
5730 	vm_map_address_t v,
5731 	ppnum_t pn,
5732 	vm_prot_t prot,
5733 	vm_prot_t fault_type,
5734 	unsigned int flags,
5735 	boolean_t wired,
5736 	__unused pmap_mapping_type_t mapping_type)
5737 {
5738 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5739 }
5740 
5741 /*
5742  * Attempt to commit the pte.
5743  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5744  * Performs no page table or accounting writes on failures.
5745  */
5746 static inline bool
5747 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5748 {
5749 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5750 	bool success = false, changed_wiring = false;
5751 
5752 	__unreachable_ok_push
5753 	if (TEST_PAGE_RATIO_4) {
5754 		/*
5755 		 * 16K virtual pages w/ 4K hw pages.
5756 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5757 		 * As a result we require the exclusive pmap lock.
5758 		 */
5759 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5760 		*old_pte = *pte_p;
5761 		if (*old_pte == new_pte) {
5762 			/* Another thread completed this operation. Nothing to do here. */
5763 			success = true;
5764 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5765 		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5766 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5767 			success = false;
5768 		} else {
5769 			write_pte_fast(pte_p, new_pte);
5770 			success = true;
5771 		}
5772 	} else {
5773 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5774 	}
5775 	__unreachable_ok_pop
5776 
5777 	if (success && *old_pte != new_pte) {
5778 		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5779 			bool need_strong_sync = false;
5780 			FLUSH_PTE_STRONG();
5781 #if HAS_FEAT_XS
5782 			if (pte_is_xs(pt_attr, *old_pte)) {
5783 				need_strong_sync = true;
5784 			}
5785 #endif /* HAS_FEAT_XS */
5786 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5787 		} else {
5788 			FLUSH_PTE();
5789 			__builtin_arm_isb(ISB_SY);
5790 		}
5791 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5792 		    (new_pte & ARM_PTE_WIRED) != 0 :
5793 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5794 
5795 		if (pmap != kernel_pmap && changed_wiring) {
5796 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5797 			if (new_pte & ARM_PTE_WIRED) {
5798 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5799 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5800 			} else {
5801 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5802 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5803 			}
5804 		}
5805 
5806 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5807 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5808 	}
5809 	return success;
5810 }
5811 
/*
 * Translate VM_WIMG_* cacheability/ordering attributes into template PTE
 * bits: the memory attribute index (ATTRINDX), shareability, and — for
 * device-type mappings — the NX/PNX execute-never bits.
 *
 * @param wimg WIMG attribute; only the VM_WIMG_MASK portion is examined.
 * @param pa   physical address, consulted to distinguish DRAM from I/O
 *             regions for the posted/device attribute variants.
 *
 * @return PTE template bits to be OR'd into the mapping's PTE.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		// DRAM gets the combined-reordered attribute; true I/O stays posted.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		// Non-DRAM uses the XS variant on FEAT_XS-capable hardware.
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized WIMG values fall back to normal cacheable memory. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5883 
5884 
5885 /*
5886  * Construct a PTE (and the physical page attributes) for the given virtual to
5887  * physical mapping.
5888  *
5889  * This function has no side effects and is safe to call so that it is safe to
5890  * call while attempting a pmap_enter transaction.
5891  */
5892 MARK_AS_PMAP_TEXT static pt_entry_t
5893 pmap_construct_pte(
5894 	const pmap_t pmap,
5895 	vm_map_address_t va,
5896 	pmap_paddr_t pa,
5897 	vm_prot_t prot,
5898 	vm_prot_t fault_type,
5899 	boolean_t wired,
5900 	const pt_attr_t* const pt_attr,
5901 	uint16_t *pp_attr_bits /* OUTPUT */
5902 	)
5903 {
5904 	bool set_NX = false, set_XO = false;
5905 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5906 	assert(pp_attr_bits != NULL);
5907 	*pp_attr_bits = 0;
5908 
5909 	if (wired) {
5910 		pte |= ARM_PTE_WIRED;
5911 	}
5912 
5913 #if DEVELOPMENT || DEBUG
5914 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5915 #else
5916 	if ((prot & VM_PROT_EXECUTE))
5917 #endif
5918 	{
5919 		set_NX = false;
5920 	} else {
5921 		set_NX = true;
5922 	}
5923 
5924 	if (prot == VM_PROT_EXECUTE) {
5925 		set_XO = true;
5926 	}
5927 
5928 	if (set_NX) {
5929 		pte |= pt_attr_leaf_xn(pt_attr);
5930 	} else {
5931 		if (pmap == kernel_pmap) {
5932 			pte |= ARM_PTE_NX;
5933 		} else {
5934 			pte |= pt_attr_leaf_x(pt_attr);
5935 		}
5936 	}
5937 
5938 	if (pmap == kernel_pmap) {
5939 #if __ARM_KERNEL_PROTECT__
5940 		pte |= ARM_PTE_NG;
5941 #endif /* __ARM_KERNEL_PROTECT__ */
5942 		if (prot & VM_PROT_WRITE) {
5943 			pte |= ARM_PTE_AP(AP_RWNA);
5944 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5945 		} else {
5946 			pte |= ARM_PTE_AP(AP_RONA);
5947 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5948 		}
5949 	} else {
5950 		if (pmap->type != PMAP_TYPE_NESTED) {
5951 			pte |= ARM_PTE_NG;
5952 		} else if ((pmap->nested_region_unnested_table_bitmap)
5953 		    && (va >= pmap->nested_region_addr)
5954 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5955 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5956 
5957 			if ((pmap->nested_region_unnested_table_bitmap)
5958 			    && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
5959 				pte |= ARM_PTE_NG;
5960 			}
5961 		}
5962 		if (prot & VM_PROT_WRITE) {
5963 			assert(pmap->type != PMAP_TYPE_NESTED);
5964 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5965 				if (fault_type & VM_PROT_WRITE) {
5966 					pte |= pt_attr_leaf_rw(pt_attr);
5967 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5968 				} else {
5969 					pte |= pt_attr_leaf_ro(pt_attr);
5970 					/*
5971 					 * Mark the page as MODFAULT so that a subsequent write
5972 					 * may be handled through arm_fast_fault().
5973 					 */
5974 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5975 					pte_set_was_writeable(pte, true);
5976 				}
5977 			} else {
5978 				pte |= pt_attr_leaf_rw(pt_attr);
5979 				*pp_attr_bits |= PP_ATTR_REFERENCED;
5980 			}
5981 		} else {
5982 			if (set_XO) {
5983 				pte |= pt_attr_leaf_rona(pt_attr);
5984 			} else {
5985 				pte |= pt_attr_leaf_ro(pt_attr);
5986 			}
5987 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5988 		}
5989 	}
5990 
5991 	pte |= ARM_PTE_AF;
5992 	return pte;
5993 }
5994 
5995 MARK_AS_PMAP_TEXT kern_return_t
5996 pmap_enter_options_internal(
5997 	pmap_t pmap,
5998 	vm_map_address_t v,
5999 	pmap_paddr_t pa,
6000 	vm_prot_t prot,
6001 	vm_prot_t fault_type,
6002 	unsigned int flags,
6003 	boolean_t wired,
6004 	unsigned int options)
6005 {
6006 	ppnum_t         pn = (ppnum_t)atop(pa);
6007 	pt_entry_t      pte;
6008 	pt_entry_t      spte;
6009 	pt_entry_t      *pte_p;
6010 	bool            refcnt_updated;
6011 	bool            wiredcnt_updated;
6012 	bool            ro_va = false;
6013 	unsigned int    wimg_bits;
6014 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6015 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6016 	kern_return_t   kr = KERN_SUCCESS;
6017 	uint16_t pp_attr_bits;
6018 	volatile uint16_t *refcnt;
6019 	volatile uint16_t *wiredcnt;
6020 	pv_free_list_t *local_pv_free;
6021 
6022 	validate_pmap_mutable(pmap);
6023 
6024 #if XNU_MONITOR
6025 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6026 		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6027 	}
6028 #endif
6029 
6030 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6031 
6032 	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6033 		panic("%s: pmap %p v 0x%llx not page-aligned",
6034 		    __func__, pmap, (unsigned long long)v);
6035 	}
6036 
6037 	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6038 		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6039 	}
6040 
6041 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6042 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6043 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6044 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6045 	}
6046 
6047 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6048 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6049 		    pmap, (uint64_t)pa);
6050 	}
6051 
6052 	/* The PA should not extend beyond the architected physical address space */
6053 	pa &= ARM_PTE_PAGE_MASK;
6054 
6055 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6056 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6057 		extern vm_offset_t ctrr_test_page;
6058 		if (__probable(v != ctrr_test_page))
6059 #endif
6060 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6061 	}
6062 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6063 		if (__improbable(prot != VM_PROT_READ)) {
6064 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6065 			    __func__, (unsigned long long)v, prot);
6066 		}
6067 		ro_va = true;
6068 	}
6069 	assert(pn != vm_page_fictitious_addr);
6070 
6071 	refcnt_updated = false;
6072 	wiredcnt_updated = false;
6073 
6074 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6075 		/*
6076 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6077 		 *
6078 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6079 		 */
6080 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6081 	}
6082 
6083 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6084 		return KERN_ABORTED;
6085 	}
6086 
6087 	/*
6088 	 *	Expand pmap to include this pte.  Assume that
6089 	 *	pmap is always expanded to include enough hardware
6090 	 *	pages to map one VM page.
6091 	 */
6092 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6093 		/* Must unlock to expand the pmap. */
6094 		pmap_unlock(pmap, lock_mode);
6095 
6096 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6097 
6098 		if (kr != KERN_SUCCESS) {
6099 			return kr;
6100 		}
6101 
6102 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6103 			return KERN_ABORTED;
6104 		}
6105 	}
6106 
6107 	if (options & PMAP_OPTIONS_NOENTER) {
6108 		pmap_unlock(pmap, lock_mode);
6109 		return KERN_SUCCESS;
6110 	}
6111 
6112 	/*
6113 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6114 	 * done via a cmpxchg loop.
6115 	 * We need to be careful about modifying non-local data structures before commiting
6116 	 * the new pte since we may need to re-do the transaction.
6117 	 */
6118 	spte = os_atomic_load(pte_p, relaxed);
6119 	while (!committed) {
6120 		refcnt = NULL;
6121 		wiredcnt = NULL;
6122 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6123 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6124 
6125 		if (pmap != kernel_pmap) {
6126 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6127 			refcnt = &ptd_info->refcnt;
6128 			wiredcnt = &ptd_info->wiredcnt;
6129 			/*
6130 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6131 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6132 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6133 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6134 			 * have PTDs, so we can't use the check there.
6135 			 */
6136 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6137 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6138 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6139 			}
6140 			/*
6141 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6142 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6143 			 * or acquire the pmap lock exclusive.
6144 			 */
6145 			if (!wiredcnt_updated) {
6146 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6147 				wiredcnt_updated = true;
6148 			}
6149 			if (!refcnt_updated) {
6150 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6151 				refcnt_updated = true;
6152 				drop_refcnt = true;
6153 			}
6154 		}
6155 
6156 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6157 			/*
6158 			 * There is already a mapping here & it's for a different physical page.
6159 			 * First remove that mapping.
6160 			 *
6161 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6162 			 */
6163 			if (lock_mode == PMAP_LOCK_SHARED) {
6164 				if (pmap_lock_shared_to_exclusive(pmap)) {
6165 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6166 				} else {
6167 					/*
6168 					 * We failed to upgrade to an exclusive lock.
6169 					 * As a result we no longer hold the lock at all,
6170 					 * so we need to re-acquire it and restart the transaction.
6171 					 */
6172 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6173 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6174 					/* pmap might have changed after we dropped the lock. Try again. */
6175 					spte = os_atomic_load(pte_p, relaxed);
6176 					continue;
6177 				}
6178 			}
6179 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6180 			spte = ARM_PTE_TYPE_FAULT;
6181 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6182 		}
6183 
6184 		/*
6185 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6186 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6187 		 * read-write protection. The PMAP layer though still needs to use the right
6188 		 * index, which is the older XO-now-TPRO one and that is specially selected
6189 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6190 		 */
6191 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6192 			if (__improbable(pmap == kernel_pmap)) {
6193 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6194 				    __func__);
6195 			}
6196 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6197 		} else {
6198 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6199 		}
6200 
6201 		if (pa_valid(pa)) {
6202 			unsigned int pai;
6203 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6204 
6205 			is_internal = FALSE;
6206 			is_altacct = FALSE;
6207 
6208 			pai = pa_index(pa);
6209 
6210 			pvh_lock(pai);
6211 
6212 			/*
6213 			 * Make sure that the current per-cpu PV free list has
6214 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6215 			 * if the transaction succeeds. We're either in the
6216 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6217 			 * Note that we can still be interrupted, but a primary
6218 			 * interrupt handler can never enter the pmap.
6219 			 */
6220 #if !XNU_MONITOR
6221 			assert(get_preemption_level() > 0);
6222 #endif
6223 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6224 			pv_entry_t **pv_h = pai_to_pvh(pai);
6225 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6226 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6227 
6228 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6229 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6230 				int new_allocated_pves = 0;
6231 
6232 				while (new_allocated_pves < 2) {
6233 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6234 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6235 					if (pv_status == PV_ALLOC_FAIL) {
6236 						break;
6237 					} else if (pv_status == PV_ALLOC_RETRY) {
6238 						/*
6239 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6240 						 * it will have dropped the pmap lock while doing so.
6241 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6242 						 * be on a different CPU now.
6243 						 */
6244 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6245 					} else {
6246 						/* If we've gotten this far then a node should've been allocated. */
6247 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6248 
6249 						new_allocated_pves++;
6250 					}
6251 				}
6252 
6253 				for (int i = 0; i < new_allocated_pves; i++) {
6254 					pv_free(new_pve_p[i]);
6255 				}
6256 			}
6257 
6258 			if (pv_status == PV_ALLOC_FAIL) {
6259 				pvh_unlock(pai);
6260 				kr = KERN_RESOURCE_SHORTAGE;
6261 				break;
6262 			} else if (pv_status == PV_ALLOC_RETRY) {
6263 				pvh_unlock(pai);
6264 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6265 				spte = os_atomic_load(pte_p, relaxed);
6266 				continue;
6267 			}
6268 
6269 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6270 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6271 			} else {
6272 				wimg_bits = pmap_cache_attributes(pn);
6273 			}
6274 
6275 			/* We may be retrying this operation after dropping the PVH lock.
6276 			 * Cache attributes for the physical page may have changed while the lock
6277 			 * was dropped, so clear any cache attributes we may have previously set
6278 			 * in the PTE template. */
6279 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6280 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6281 
6282 #if XNU_MONITOR
6283 			/* The regular old kernel is not allowed to remap PPL pages. */
6284 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6285 				panic("%s: page belongs to PPL, "
6286 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6287 				    __FUNCTION__,
6288 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6289 			}
6290 
6291 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6292 				panic("%s: page locked down, "
6293 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6294 				    __FUNCTION__,
6295 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6296 			}
6297 #endif
6298 
6299 
6300 
6301 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6302 			if (!committed) {
6303 				pvh_unlock(pai);
6304 				continue;
6305 			}
6306 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6307 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6308 
6309 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6310 			/*
6311 			 * If there was already a valid pte here then we reuse its reference
6312 			 * on the ptd and drop the one that we took above.
6313 			 */
6314 			drop_refcnt = had_valid_mapping;
6315 
6316 			if (!had_valid_mapping) {
6317 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6318 				int pve_ptep_idx = 0;
6319 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6320 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6321 				if (pv_status != PV_ALLOC_SUCCESS) {
6322 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6323 					    __func__, pv_status, new_pve_p, pmap);
6324 				}
6325 
6326 				if (pmap != kernel_pmap) {
6327 					if (options & PMAP_OPTIONS_INTERNAL) {
6328 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6329 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6330 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6331 							/*
6332 							 * Make a note to ourselves that this
6333 							 * mapping is using alternative
6334 							 * accounting. We'll need this in order
6335 							 * to know which ledger to debit when
6336 							 * the mapping is removed.
6337 							 *
6338 							 * The altacct bit must be set while
6339 							 * the pv head is locked. Defer the
6340 							 * ledger accounting until after we've
6341 							 * dropped the lock.
6342 							 */
6343 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6344 							is_altacct = TRUE;
6345 						}
6346 					}
6347 					if (ppattr_test_reusable(pai) &&
6348 					    !is_altacct) {
6349 						is_reusable = TRUE;
6350 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6351 						is_internal = TRUE;
6352 					} else {
6353 						is_external = TRUE;
6354 					}
6355 				}
6356 			}
6357 
6358 			pvh_unlock(pai);
6359 
6360 			if (pp_attr_bits != 0) {
6361 				ppattr_pa_set_bits(pa, pp_attr_bits);
6362 			}
6363 
6364 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6365 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6366 
6367 				if (is_internal) {
6368 					/*
6369 					 * Make corresponding adjustments to
6370 					 * phys_footprint statistics.
6371 					 */
6372 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6373 					if (is_altacct) {
6374 						/*
6375 						 * If this page is internal and
6376 						 * in an IOKit region, credit
6377 						 * the task's total count of
6378 						 * dirty, internal IOKit pages.
6379 						 * It should *not* count towards
6380 						 * the task's total physical
6381 						 * memory footprint, because
6382 						 * this entire region was
6383 						 * already billed to the task
6384 						 * at the time the mapping was
6385 						 * created.
6386 						 *
6387 						 * Put another way, this is
6388 						 * internal++ and
6389 						 * alternate_accounting++, so
6390 						 * net effect on phys_footprint
6391 						 * is 0. That means: don't
6392 						 * touch phys_footprint here.
6393 						 */
6394 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6395 					} else {
6396 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6397 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6398 							skip_footprint_debit = true;
6399 						} else {
6400 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6401 						}
6402 					}
6403 				}
6404 				if (is_reusable) {
6405 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6406 				} else if (is_external) {
6407 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6408 				}
6409 			}
6410 		} else {
6411 			if (prot & VM_PROT_EXECUTE) {
6412 				kr = KERN_FAILURE;
6413 				break;
6414 			}
6415 
6416 			wimg_bits = pmap_cache_attributes(pn);
6417 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6418 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6419 			}
6420 
6421 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6422 
6423 #if XNU_MONITOR
6424 			pte = pmap_construct_io_pte(pa, pte);
6425 
6426 			/**
6427 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6428 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed.  We must therefore prevent an attacker from downgrading
			 * a writable mapping in order to allow it to be removed and remapped to something else.
6431 			 */
6432 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6433 			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6434 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6435 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6436 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6437 				    __func__, (uint64_t)pte_to_pa(spte));
6438 			}
6439 #endif
6440 
6441 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6442 			if (committed) {
6443 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6444 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6445 
6446 				/**
6447 				 * If there was already a valid pte here then we reuse its
6448 				 * reference on the ptd and drop the one that we took above.
6449 				 */
6450 				drop_refcnt = had_valid_mapping;
6451 			}
6452 		}
6453 		if (committed) {
6454 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6455 				assert(pmap != kernel_pmap);
6456 
6457 				/* One less "compressed" */
6458 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6459 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6460 
6461 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6462 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6463 				} else if (!skip_footprint_debit) {
6464 					/* Was part of the footprint */
6465 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6466 				}
6467 				/* The old entry held a reference so drop the extra one that we took above. */
6468 				drop_refcnt = true;
6469 			}
6470 		}
6471 	}
6472 
6473 	if (drop_refcnt && refcnt != NULL) {
6474 		assert(refcnt_updated);
6475 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6476 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6477 		}
6478 	}
6479 
6480 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6481 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6482 	}
6483 
6484 	pmap_unlock(pmap, lock_mode);
6485 
6486 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6487 		pmap_phys_write_disable(v);
6488 	}
6489 
6490 	return kr;
6491 }
6492 
/*
 * Enter a translation for physical address 'pa' at virtual address 'v' in
 * 'pmap', retrying the underlying enter operation until it either succeeds
 * or fails for a non-transient reason.
 *
 * @return KERN_SUCCESS on success, or the first non-retryable error from
 *         the underlying call (e.g. KERN_RESOURCE_SHORTAGE when
 *         PMAP_OPTIONS_NOWAIT was requested and no pages were available).
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * Always pass PMAP_OPTIONS_NOWAIT into the PPL, which cannot block;
		 * page shortages are instead serviced from outside the PPL below.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed a page to the PPL's free list, then retry (unless NOWAIT). */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
		/* KERN_ABORTED means the helper bailed due to pending preemption; always retry. */
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6539 
6540 kern_return_t
6541 pmap_enter_options(
6542 	pmap_t pmap,
6543 	vm_map_address_t v,
6544 	ppnum_t pn,
6545 	vm_prot_t prot,
6546 	vm_prot_t fault_type,
6547 	unsigned int flags,
6548 	boolean_t wired,
6549 	unsigned int options,
6550 	__unused void   *arg,
6551 	pmap_mapping_type_t mapping_type)
6552 {
6553 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6554 }
6555 
6556 /*
6557  *	Routine:	pmap_change_wiring
6558  *	Function:	Change the wiring attribute for a map/virtual-address
6559  *			pair.
6560  *	In/out conditions:
6561  *			The mapping must already exist in the pmap.
6562  */
6563 MARK_AS_PMAP_TEXT kern_return_t
6564 pmap_change_wiring_internal(
6565 	pmap_t pmap,
6566 	vm_map_address_t v,
6567 	boolean_t wired)
6568 {
6569 	pt_entry_t     *pte_p;
6570 	pmap_paddr_t    pa;
6571 
6572 	validate_pmap_mutable(pmap);
6573 
6574 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6575 		return KERN_ABORTED;
6576 	}
6577 
6578 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6579 
6580 	pte_p = pmap_pte(pmap, v);
6581 	if (pte_p == PT_ENTRY_NULL) {
6582 		if (!wired) {
6583 			/*
6584 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6585 			 * may have been freed by a remove operation.
6586 			 */
6587 			goto pmap_change_wiring_return;
6588 		} else {
6589 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6590 		}
6591 	}
6592 	/*
6593 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6594 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6595 	 */
6596 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6597 
6598 	while (pa_valid(pa)) {
6599 		pmap_paddr_t new_pa;
6600 
6601 		pvh_lock(pa_index(pa));
6602 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6603 
6604 		if (pa == new_pa) {
6605 			break;
6606 		}
6607 
6608 		pvh_unlock(pa_index(pa));
6609 		pa = new_pa;
6610 	}
6611 
6612 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6613 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6614 		if (!wired) {
6615 			/* PTE cleared by prior remove/disconnect operation */
6616 			goto pmap_change_wiring_cleanup;
6617 		} else {
6618 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6619 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6620 		}
6621 	}
6622 
6623 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6624 	if (wired != pte_is_wired(*pte_p)) {
6625 		pte_set_wired(pmap, pte_p, wired);
6626 		if (pmap != kernel_pmap) {
6627 			if (wired) {
6628 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6629 			} else if (!wired) {
6630 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6631 			}
6632 		}
6633 	}
6634 
6635 pmap_change_wiring_cleanup:
6636 	if (pa_valid(pa)) {
6637 		pvh_unlock(pa_index(pa));
6638 	}
6639 
6640 pmap_change_wiring_return:
6641 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6642 
6643 	return KERN_SUCCESS;
6644 }
6645 
6646 void
6647 pmap_change_wiring(
6648 	pmap_t pmap,
6649 	vm_map_address_t v,
6650 	boolean_t wired)
6651 {
6652 	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
6653 	pmap_verify_preemptible();
6654 
6655 	kern_return_t kr = KERN_FAILURE;
6656 #if XNU_MONITOR
6657 	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6658 	do {
6659 		kr = pmap_change_wiring_ppl(pmap, v, wired);
6660 	} while (kr == KERN_ABORTED);
6661 
6662 	pmap_ledger_check_balance(pmap);
6663 #else
6664 	/* Since we verified preemptibility, call the helper only once. */
6665 	kr = pmap_change_wiring_internal(pmap, v, wired);
6666 #endif
6667 
6668 	if (kr != KERN_SUCCESS) {
6669 		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6670 		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6671 	}
6672 }
6673 
6674 MARK_AS_PMAP_TEXT pmap_paddr_t
6675 pmap_find_pa_internal(
6676 	pmap_t pmap,
6677 	addr64_t va)
6678 {
6679 	pmap_paddr_t    pa = 0;
6680 
6681 	validate_pmap(pmap);
6682 
6683 	if (pmap != kernel_pmap) {
6684 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6685 	}
6686 
6687 	pa = pmap_vtophys(pmap, va);
6688 
6689 	if (pmap != kernel_pmap) {
6690 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6691 	}
6692 
6693 	return pa;
6694 }
6695 
6696 pmap_paddr_t
6697 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6698 {
6699 	pmap_paddr_t pa = 0;
6700 
6701 	if (pmap == kernel_pmap) {
6702 		pa = mmu_kvtop(va);
6703 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6704 		/*
6705 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6706 		 * translation even if PAN would prevent kernel access through the translation.
6707 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6708 		 */
6709 		pa = mmu_uvtop(va);
6710 	}
6711 	return pa;
6712 }
6713 
6714 pmap_paddr_t
6715 pmap_find_pa(
6716 	pmap_t pmap,
6717 	addr64_t va)
6718 {
6719 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6720 
6721 	if (pa != 0) {
6722 		return pa;
6723 	}
6724 
6725 	if (not_in_kdp) {
6726 #if XNU_MONITOR
6727 		return pmap_find_pa_ppl(pmap, va);
6728 #else
6729 		return pmap_find_pa_internal(pmap, va);
6730 #endif
6731 	} else {
6732 		return pmap_vtophys(pmap, va);
6733 	}
6734 }
6735 
6736 ppnum_t
6737 pmap_find_phys_nofault(
6738 	pmap_t pmap,
6739 	addr64_t va)
6740 {
6741 	ppnum_t ppn;
6742 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6743 	return ppn;
6744 }
6745 
6746 ppnum_t
6747 pmap_find_phys(
6748 	pmap_t pmap,
6749 	addr64_t va)
6750 {
6751 	ppnum_t ppn;
6752 	ppn = atop(pmap_find_pa(pmap, va));
6753 	return ppn;
6754 }
6755 
6756 /**
6757  * Translate a kernel virtual address into a physical address.
6758  *
6759  * @param va The kernel virtual address to translate. Does not work on user
6760  *           virtual addresses.
6761  *
6762  * @return The physical address if the translation was successful, or zero if
6763  *         no valid mappings were found for the given virtual address.
6764  */
6765 pmap_paddr_t
6766 kvtophys(vm_offset_t va)
6767 {
6768 	/**
6769 	 * Attempt to do the translation first in hardware using the AT (address
6770 	 * translation) instruction. This will attempt to use the MMU to do the
6771 	 * translation for us.
6772 	 */
6773 	pmap_paddr_t pa = mmu_kvtop(va);
6774 
6775 	if (pa) {
6776 		return pa;
6777 	}
6778 
6779 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6780 	return pmap_vtophys(kernel_pmap, va);
6781 }
6782 
6783 /**
6784  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6785  * points to a non-kernel-managed physical page, then this call will panic().
6786  *
6787  * @note The output of this function is guaranteed to be a kernel-managed
6788  *       physical page, which means it's safe to pass the output directly to
6789  *       pa_index() to create a physical address index for various pmap data
6790  *       structures.
6791  *
6792  * @param va The kernel virtual address to translate. Does not work on user
6793  *           virtual addresses.
6794  *
6795  * @return The translated physical address for the given virtual address.
6796  */
6797 pmap_paddr_t
6798 kvtophys_nofail(vm_offset_t va)
6799 {
6800 	pmap_paddr_t pa = kvtophys(va);
6801 
6802 	if (!pa_valid(pa)) {
6803 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6804 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6805 	}
6806 
6807 	return pa;
6808 }
6809 
/*
 * Software page-table walk: translate 'va' in 'pmap' to a physical address.
 * Returns 0 when the address is out of range or no valid mapping exists.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Addresses outside this pmap's translatable range can never be mapped. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Walk from the root translation table level down toward the leaf level. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		/* Per-level descriptor masks come from the pmap's page-table attributes. */
		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* A non-valid entry at any level means there is no mapping. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the output address with the VA's offset within the block/page. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table descriptor: descend into the next-level table it points to. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6854 
6855 /*
6856  *	pmap_init_pte_page - Initialize a page table page.
6857  */
/*
 * Initialize a page table page: ensure the physical page backing 'pte_p' has
 * a page table descriptor (PTD) registered in its PV head, then record
 * (pmap, va, ttlevel) in that descriptor before the table is linked in.
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	/* Locate the PV head entry for the physical page backing this table. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* The page already has a PTD (see bootstrap note above); reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6897 
6898 /*
6899  *	Routine:	pmap_expand
6900  *
6901  *	Expands a pmap to be able to map the specified virtual address.
6902  *
6903  *	Allocates new memory for the default (COARSE) translation table
6904  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6905  *	also allocates space for the corresponding pv entries.
6906  *
6907  *	Nothing should be locked.
6908  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Descend one translation-table level per iteration until 'level' is reached. */
	for (; ttlevel < level; ttlevel++) {
		/* Abort rather than blocking if preemption is pending; caller may retry. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock while allocating: the allocation may block. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Lock acquisition aborted: free the unused table before bailing. */
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock: another thread may have raced us here. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Publish the new table by linking it into the parent table entry. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				/* Ownership of tt_p has transferred to the page tables; don't free it below. */
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If our freshly-allocated table wasn't consumed (we lost the race), free it. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6980 
6981 /*
6982  *	Routine:	pmap_gc
6983  *	Function:
6984  *              Pmap garbage collection
6985  *		Called by the pageout daemon when pages are scarce.
6986  *
6987  */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
6999 
7000 /*
7001  *      By default, don't attempt pmap GC more frequently
 *      than once per minute.
7003  */
7004 
/* Intentionally a no-op: pmap GC throttling is not implemented on ARM. */
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7010 
7011 /*
7012  * pmap_attribute_cache_sync(vm_offset_t pa)
7013  *
7014  * Invalidates all of the instruction cache on a physical page and
7015  * pushes any dirty data from the data cache for the same physical page
7016  */
7017 
7018 kern_return_t
7019 pmap_attribute_cache_sync(
7020 	ppnum_t pp,
7021 	vm_size_t size,
7022 	__unused vm_machine_attribute_t attribute,
7023 	__unused vm_machine_attribute_val_t * value)
7024 {
7025 	if (size > PAGE_SIZE) {
7026 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7027 	} else {
7028 		cache_sync_page(pp);
7029 	}
7030 
7031 	return KERN_SUCCESS;
7032 }
7033 
7034 /*
7035  * pmap_sync_page_data_phys(ppnum_t pp)
7036  *
7037  * Invalidates all of the instruction cache on a physical page and
7038  * pushes any dirty data from the data cache for the same physical page
7039  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Delegate to the common helper that syncs the caches for a single page. */
	cache_sync_page(pp);
}
7046 
7047 /*
7048  * pmap_sync_page_attributes_phys(ppnum_t pp)
7049  *
7050  * Write back and invalidate all cachelines on a physical page.
7051  */
7052 void
7053 pmap_sync_page_attributes_phys(
7054 	ppnum_t pp)
7055 {
7056 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7057 }
7058 
7059 #if CONFIG_COREDUMP
7060 /* temporary workaround */
7061 boolean_t
7062 coredumpok(
7063 	vm_map_t map,
7064 	mach_vm_offset_t va)
7065 {
7066 	pt_entry_t     *pte_p;
7067 	pt_entry_t      spte;
7068 
7069 	pte_p = pmap_pte(map->pmap, va);
7070 	if (0 == pte_p) {
7071 		return FALSE;
7072 	}
7073 	if (vm_map_entry_has_device_pager(map, va)) {
7074 		return FALSE;
7075 	}
7076 	spte = *pte_p;
7077 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7078 }
7079 #endif
7080 
7081 void
7082 fillPage(
7083 	ppnum_t pn,
7084 	unsigned int fill)
7085 {
7086 	unsigned int   *addr;
7087 	int             count;
7088 
7089 	addr = (unsigned int *) phystokv(ptoa(pn));
7090 	count = PAGE_SIZE / sizeof(unsigned int);
7091 	while (count--) {
7092 		*addr++ = fill;
7093 	}
7094 }
7095 
7096 extern void     mapping_set_mod(ppnum_t pn);
7097 
/* Mark physical page 'pn' as modified in the pmap-layer page attributes. */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7104 
7105 extern void     mapping_set_ref(ppnum_t pn);
7106 
/* Mark physical page 'pn' as referenced in the pmap-layer page attributes. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7113 
7114 /*
7115  * Clear specified attribute bits.
7116  *
7117  * Try to force an arm_fast_fault() for all mappings of
7118  * the page - to force attributes to be set again at fault time.
7119  * If the forcing succeeds, clear the cached bits at the head.
7120  * Otherwise, something must have been wired, so leave the cached
7121  * attributes alone.
7122  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits must never be cleared through this path. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-supplied flush arg or flush range implies a TLB flush will occur; drop NOFLUSH. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Map the attribute bits being cleared to the access modes that must fault again. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if all mappings were successfully demoted. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7196 
/*
 * Single-page entry point for clearing attribute bits: invokes the common
 * path with no flush-range coalescing.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7206 
7207 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for all managed pages mapped within a single twig
 * (one leaf table's worth of VA space) of the given pmap.
 *
 * Must be called with the pmap lock held (shared).  [start, end) must not
 * span more than one twig.
 *
 * @return 'end' if the entire sub-range was processed; otherwise the VA at
 *         which processing stopped due to pending preemption, from which
 *         the caller should resume.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No translation table entry: nothing is mapped here; report completion. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/*
			 * Check for pending preemption after at least one page has
			 * been processed, so forward progress is guaranteed.
			 */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only managed (pa_valid) pages carry attribute state. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7251 
/*
 * Clear attribute bits for all managed pages mapped in [start, end) of the
 * given pmap, coalescing TLB invalidations into a single ranged flush.
 *
 * @return the VA up to which the range was processed; a return value less
 *         than 'end' indicates the operation was interrupted (lock
 *         acquisition failed or preemption pending) and should be resumed
 *         by the caller from that VA.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* Bail out (returning 'start') if the lock can't be taken without blocking preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Process the range one twig (leaf table) at a time. */
	while (va < end) {
		vm_map_address_t curr_end;

		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig was cut short or preemption is pending. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the coalesced ranged TLB flush accumulated above, if any. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7304 
/*
 * Clear attribute bits over a VA range, retrying the (possibly preemptible)
 * internal/PPL operation until the whole range has been processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* Each call returns how far it got; loop until the range is exhausted. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7335 #endif /* __ARM_RANGE_TLBI__ */
7336 
/*
 * Single-page front end for clearing attribute bits; dispatches to the PPL
 * or in-kernel implementation depending on configuration.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7358 
7359 /*
7360  *	Set specified attribute bits.
7361  *
7362  *	Set cached value in the pv head because we have
7363  *	no per-mapping hardware support for referenced and
7364  *	modify bits.
7365  */
MARK_AS_PMAP_TEXT void
phys_attribute_set_internal(
	ppnum_t pn,
	unsigned int bits)
{
	pmap_paddr_t    pa = ptoa(pn);
	assert(pn != vm_page_fictitious_addr);

#if XNU_MONITOR
	/* The PPL exclusively owns certain attribute bits; kernel callers may not set them. */
	if (bits & PP_ATTR_PPL_OWNED_BITS) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x",
		    __FUNCTION__,
		    pn, bits);
	}
#endif

	/* Setting cached bits requires no PTE changes or TLB maintenance. */
	ppattr_pa_set_bits(pa, (uint16_t)bits);

	return;
}
7387 
/*
 * Front end for setting attribute bits; dispatches to the PPL or in-kernel
 * implementation depending on configuration.
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7399 
7400 
7401 /*
7402  *	Check specified attribute bits.
7403  *
7404  *	use the software cached bits (since no hw support).
7405  */
7406 static boolean_t
7407 phys_attribute_test(
7408 	ppnum_t pn,
7409 	unsigned int bits)
7410 {
7411 	pmap_paddr_t    pa = ptoa(pn);
7412 	assert(pn != vm_page_fictitious_addr);
7413 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7414 }
7415 
7416 
7417 /*
7418  *	Set the modify/reference bits on the specified physical page.
7419  */
/* Mark the given physical page as modified in the cached attributes. */
void
pmap_set_modify(ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7425 
7426 
7427 /*
7428  *	Clear the modify bits on the specified physical page.
7429  */
/* Clear the cached "modified" state for the given physical page. */
void
pmap_clear_modify(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7436 
7437 
7438 /*
7439  *	pmap_is_modified:
7440  *
7441  *	Return whether or not the specified physical page is modified
7442  *	by any physical maps.
7443  */
/* Return whether the given physical page is marked modified. */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7450 
7451 
7452 /*
7453  *	Set the reference bit on the specified physical page.
7454  */
/* Mark the given physical page as referenced in the cached attributes. */
static void
pmap_set_reference(
	ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7461 
7462 /*
7463  *	Clear the reference bits on the specified physical page.
7464  */
/* Clear the cached "referenced" state for the given physical page. */
void
pmap_clear_reference(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7471 
7472 
7473 /*
7474  *	pmap_is_referenced:
7475  *
7476  *	Return whether or not the specified physical page is referenced
7477  *	by any physical maps.
7478  */
/* Return whether the given physical page is marked referenced. */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7485 
7486 /*
7487  * pmap_get_refmod(phys)
7488  *  returns the referenced and modified bits of the specified
7489  *  physical page.
7490  */
7491 unsigned int
7492 pmap_get_refmod(
7493 	ppnum_t pn)
7494 {
7495 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7496 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7497 }
7498 
7499 static inline unsigned int
7500 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7501 {
7502 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7503 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7504 }
7505 
7506 /*
7507  * pmap_clear_refmod(phys, mask)
7508  *  clears the referenced and modified bits as specified by the mask
7509  *  of the specified physical page.
7510  */
7511 void
7512 pmap_clear_refmod_options(
7513 	ppnum_t         pn,
7514 	unsigned int    mask,
7515 	unsigned int    options,
7516 	void            *arg)
7517 {
7518 	unsigned int    bits;
7519 
7520 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7521 	phys_attribute_clear(pn, bits, options, arg);
7522 }
7523 
7524 /*
7525  * Perform pmap_clear_refmod_options on a virtual address range.
7526  * The operation will be performed in bulk & tlb flushes will be coalesced
7527  * if possible.
7528  *
7529  * Returns true if the operation is supported on this platform.
7530  * If this function returns false, the operation is not supported and
7531  * nothing has been modified in the pmap.
7532  */
/*
 * Bulk variant of pmap_clear_refmod_options() over a VA range.
 *
 * @return true if the operation is supported (and was performed) on this
 *         platform; false if unsupported, in which case the pmap is
 *         unmodified and the caller must fall back to per-page operations.
 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7560 
/* Convenience wrapper: clear refmod bits with no options or VM argument. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7568 
7569 unsigned int
7570 pmap_disconnect_options(
7571 	ppnum_t pn,
7572 	unsigned int options,
7573 	void *arg)
7574 {
7575 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7576 		/*
7577 		 * On ARM, the "modified" bit is managed by software, so
7578 		 * we know up-front if the physical page is "modified",
7579 		 * without having to scan all the PTEs pointing to it.
7580 		 * The caller should have made the VM page "busy" so noone
7581 		 * should be able to establish any new mapping and "modify"
7582 		 * the page behind us.
7583 		 */
7584 		if (pmap_is_modified(pn)) {
7585 			/*
7586 			 * The page has been modified and will be sent to
7587 			 * the VM compressor.
7588 			 */
7589 			options |= PMAP_OPTIONS_COMPRESSOR;
7590 		} else {
7591 			/*
7592 			 * The page hasn't been modified and will be freed
7593 			 * instead of compressed.
7594 			 */
7595 		}
7596 	}
7597 
7598 	/* disconnect the page */
7599 	pmap_page_protect_options(pn, 0, options, arg);
7600 
7601 	/* return ref/chg status */
7602 	return pmap_get_refmod(pn);
7603 }
7604 
7605 /*
7606  *	Routine:
7607  *		pmap_disconnect
7608  *
7609  *	Function:
7610  *		Disconnect all mappings for this page and return reference and change status
7611  *		in generic format.
7612  *
7613  */
/*
 * Disconnect all mappings for this page and return reference and change
 * status in generic format (no options variant).
 */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7621 
7622 boolean_t
7623 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7624 {
7625 	if (ptoa(first) >= vm_last_phys) {
7626 		return FALSE;
7627 	}
7628 	if (ptoa(last) < vm_first_phys) {
7629 		return FALSE;
7630 	}
7631 
7632 	return TRUE;
7633 }
7634 
7635 /*
7636  * The state maintained by the noencrypt functions is used as a
7637  * debugging aid on ARM.  This incurs some overhead on the part
7638  * of the caller.  A special case check in phys_attribute_clear
7639  * (the most expensive path) currently minimizes this overhead,
7640  * but stubbing these functions out on RELEASE kernels yields
7641  * further wins.
7642  */
7643 boolean_t
7644 pmap_is_noencrypt(
7645 	ppnum_t pn)
7646 {
7647 #if DEVELOPMENT || DEBUG
7648 	boolean_t result = FALSE;
7649 
7650 	if (!pa_valid(ptoa(pn))) {
7651 		return FALSE;
7652 	}
7653 
7654 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7655 
7656 	return result;
7657 #else
7658 #pragma unused(pn)
7659 	return FALSE;
7660 #endif
7661 }
7662 
/*
 * Set the "no-encrypt" debugging attribute for the given physical page.
 * No-op on RELEASE kernels and for unmanaged pages.
 */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7677 
/*
 * Clear the "no-encrypt" debugging attribute for the given physical page.
 * No-op on RELEASE kernels and for unmanaged pages.
 */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7692 
7693 #if XNU_MONITOR
/* Return whether the given (managed) physical page is owned by the PPL. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7700 #endif
7701 
/*
 * Lock the PV-head lock for the given physical page.  Unmanaged pages
 * (and all pages when the PPL owns the PV locks, i.e. XNU_MONITOR) fall
 * back to the shared phys_backup_lock.
 *
 * NOTE: the brace-less "} else" deliberately pairs with the trailing
 * compound statement across the #if/#else boundary — do not reformat.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7718 
7719 
/*
 * Unlock the lock taken by pmap_lock_phys_page() for the given page.
 * Mirrors its structure: PV-head lock for managed pages (non-PPL builds),
 * phys_backup_lock otherwise.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7736 
/*
 * Switch the current CPU's user translation table base to the given pmap.
 *
 * For a user pmap: caches the pmap's nested (shared-region) state in the
 * per-CPU data, updates TCR if the page size differs (mixed-page-size
 * configs), then programs the TTB with the pmap's table root and ASID.
 * For the kernel pmap: clears the user TTB if it isn't already clear.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested-region info so fault paths can consult it without the pmap. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Only rewrite TCR when the incoming pmap's value differs from the live one. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Program the table base together with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7765 
/* Point the user TTB at the invalid translation table (no user mappings). */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7771 
/*
 * Traced front end for clearing the user TTB; dispatches to the PPL or
 * in-kernel implementation depending on configuration.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7783 
7784 
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Ensure the marker bit cannot collide with the other SW-managed PTE bits. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7798 
7799 
/*
 * Downgrade all mappings of a physical page so the next disallowed access
 * takes an arm_fast_fault(), allowing ref/mod state to be regathered.
 *
 * @param ppnum       Physical page whose mappings are to be downgraded.
 * @param allow_mode  Access modes that remain allowed; bits absent from this
 *                    mask are revoked (READ/EXECUTE -> clear AF; WRITE ->
 *                    make read-only and set the was-writeable SW bit).
 * @param options     PMAP_OPTIONS_* flags (reusable accounting, NOFLUSH,
 *                    FF_WIRED, FF_LOCKED).
 * @param flush_range Optional deferred-flush accumulator; TLBIs for VAs
 *                    inside the range are left to the caller.
 *
 * @return TRUE if every mapping was downgraded (caller may clear the cached
 *         attribute bits); FALSE if a wired mapping was skipped or the page
 *         is unmanaged.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	/* FF_LOCKED means the caller already holds the PV-head lock. */
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* A PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings carry no CPU permissions to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left untouched unless FF_WIRED was passed. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/*
			 * A TLB flush is only required when HW-visible bits changed
			 * (anything other than the SW "was writeable" bit).
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* VAs outside the caller's flush_range need a TLBI in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					/* Mark this PTE so pass 2 knows to invalidate it. */
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Make all pass-1 PTE stores visible before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need invalidation; clear the marker. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* The two passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record which fault types are now armed for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8113 
/*
 * Single-page entry point for arm_force_fast_fault: validates that no
 * internal-only options were passed, then invokes the common path with no
 * flush range.
 */
MARK_AS_PMAP_TEXT boolean_t
arm_force_fast_fault_internal(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options)
{
	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
	}
	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
}
8125 
8126 /*
8127  *	Routine:	arm_force_fast_fault
8128  *
8129  *	Function:
8130  *		Force all mappings for this page to fault according
8131  *		to the access modes allowed, so we can gather ref/modify
8132  *		bits again.
8133  */
8134 
boolean_t
arm_force_fast_fault(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	__unused void   *arg)
{
	pmap_paddr_t    phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Checked here as well so unmanaged pages never cross into the PPL. */
	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

#if XNU_MONITOR
	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
#else
	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
#endif
}
8156 
8157 /*
8158  *	Routine:	arm_clear_fast_fault
8159  *
8160  *	Function:
8161  *		Clear pending force fault for all mappings for this page based on
8162  *		the observed fault type, update ref/modify bits.
8163  */
/*
 * @param ppnum      Physical page whose mappings should be fixed up.
 * @param fault_type The observed fault type (read and/or write access).
 * @param pte_p      A single PTE to target, or PT_ENTRY_NULL to process every
 *                   mapping on the page's PV list.
 *
 * @return TRUE if at least one PTE was updated (caller should redrive the
 *         access), FALSE otherwise.
 *
 * NOTE(review): the pvh_assert_locked() below implies the caller must already
 * hold the PVH lock for this page.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;
	/* Pass 1/pass 2 must visit the same set of PTEs; these counters verify that. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/*
	 * If the caller did not supply a specific PTE, derive the set of mappings
	 * from the PV head: either a single PTE pointer or a chain of PV entries.
	 */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting position so pass 2 can re-walk the same mappings. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		/*
		 * A write fault on a mapping we previously downgraded from writable:
		 * restore write permission and record the page as referenced+modified.
		 */
		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault with the AF bit clear: set AF and record a reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/*
			 * Only hardware-visible changes (anything beyond the SW "was
			 * writeable" bit) require a TLB invalidation in pass 2.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the amount of work done with the PVH lock held. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make pass-1 PTE stores visible before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need invalidation; clear the marker now. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* The two passes must agree on exactly which PTEs were updated. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8365 
8366 /*
8367  * Determine if the fault was induced by software tracking of
8368  * modify/reference bits.  If so, re-enable the mapping (and set
8369  * the appropriate bits).
8370  *
8371  * Returns KERN_SUCCESS if the fault was induced and was
8372  * successfully handled.
8373  *
8374  * Returns KERN_FAILURE if the fault was not induced and
8375  * the function was unable to deal with it.
8376  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8378  * disallows this type of access.
8379  *
8380  * Returns KERN_ABORTED if the pmap lock is taken and a
8381  * preemption is pending.
8382  *
8383  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Bail out (to be retried by the caller) rather than block a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we observe a stable PTE value while holding the PVH
		 * lock for the page it maps; the PTE may change underneath us
		 * between the read and the lock acquisition.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; drop the lock and re-read. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8523 
8524 kern_return_t
8525 arm_fast_fault(
8526 	pmap_t pmap,
8527 	vm_map_address_t va,
8528 	vm_prot_t fault_type,
8529 	bool was_af_fault,
8530 	__unused bool from_user)
8531 {
8532 	kern_return_t   result = KERN_FAILURE;
8533 
8534 	if (va < pmap->min || va >= pmap->max) {
8535 		return result;
8536 	}
8537 
8538 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8539 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8540 	    from_user);
8541 
8542 	do {
8543 #if XNU_MONITOR
8544 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8545 #else
8546 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8547 #endif
8548 	} while (result == KERN_ABORTED);
8549 
8550 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8551 
8552 	return result;
8553 }
8554 
8555 void
8556 pmap_copy_page(
8557 	ppnum_t psrc,
8558 	ppnum_t pdst)
8559 {
8560 	bcopy_phys((addr64_t) (ptoa(psrc)),
8561 	    (addr64_t) (ptoa(pdst)),
8562 	    PAGE_SIZE);
8563 }
8564 
8565 
8566 /*
8567  *	pmap_copy_page copies the specified (machine independent) pages.
8568  */
8569 void
8570 pmap_copy_part_page(
8571 	ppnum_t psrc,
8572 	vm_offset_t src_offset,
8573 	ppnum_t pdst,
8574 	vm_offset_t dst_offset,
8575 	vm_size_t len)
8576 {
8577 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8578 	    (addr64_t) (ptoa(pdst) + dst_offset),
8579 	    len);
8580 }
8581 
8582 
8583 /*
8584  *	pmap_zero_page zeros the specified (machine independent) page.
8585  */
8586 void
8587 pmap_zero_page(
8588 	ppnum_t pn)
8589 {
8590 	assert(pn != vm_page_fictitious_addr);
8591 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8592 }
8593 
8594 /*
8595  *	pmap_zero_part_page
8596  *	zeros the specified (machine independent) part of a page.
8597  */
8598 void
8599 pmap_zero_part_page(
8600 	ppnum_t pn,
8601 	vm_offset_t offset,
8602 	vm_size_t len)
8603 {
8604 	assert(pn != vm_page_fictitious_addr);
8605 	assert(offset + len <= PAGE_SIZE);
8606 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8607 }
8608 
/*
 * Map the low-globals page (lowGlo) at its fixed alias address
 * (LOWGLOBAL_ALIAS) as read-only, non-executable kernel memory.
 * The target PTE must currently be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* RO, never executable (user or kernel), access flag pre-set. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE before invalidating stale translations. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8633 
8634 vm_offset_t
8635 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8636 {
8637 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8638 		panic("%s: invalid index %u", __func__, index);
8639 	}
8640 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8641 }
8642 
/*
 * Map physical page `pn` into a free per-CPU copy window with the given
 * protection and WIMG (cacheability) bits.
 *
 * @return the index of the window used; pass it to
 *         pmap_unmap_cpu_windows_copy() to tear the mapping down.
 *         Panics if every window on this CPU is in use.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* The PPL forbids mapping managed pages or writable PPL-owned I/O here. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (faulting) window on this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, non-executable mapping of the target page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8717 
/*
 * Wrapper for pmap_map_cpu_windows_copy_internal(); routes through the PPL
 * on XNU_MONITOR configurations.  Returns the index of the window mapped.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8730 
/*
 * Tear down the per-CPU copy window at `index` on the current CPU:
 * clear its PTE and invalidate the TLB entry.  `index` must have been
 * returned by a prior pmap_map_cpu_windows_copy() on this CPU.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8751 
/*
 * Wrapper for pmap_unmap_cpu_windows_copy_internal(); routes through the
 * PPL on XNU_MONITOR configurations.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8762 
8763 #if XNU_MONITOR
8764 
/*
 * Intentionally a no-op stub in this configuration: all parameters are
 * discarded and the callback is never invoked.
 */
MARK_AS_PMAP_TEXT void
pmap_invoke_with_page(
	ppnum_t page_number,
	void *ctx,
	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
{
	#pragma unused(page_number, ctx, callback)
}
8773 
8774 /*
8775  * Loop over every pmap_io_range (I/O ranges marked as owned by
8776  * the PPL in the device tree) and conditionally call callback() on each range
8777  * that needs to be included in the hibernation image.
8778  *
8779  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8780  *                 context is needed in the callback.
8781  * @param callback Callback function invoked on each range (gated by flag).
8782  */
8783 MARK_AS_PMAP_TEXT void
8784 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8785 {
8786 	extern const pmap_io_range_t* io_attr_table;
8787 	extern const unsigned int num_io_rgns;
8788 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8789 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8790 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8791 		}
8792 	}
8793 }
8794 
8795 /**
8796  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8797  * PPL-owned page. Otherwise, do nothing.
8798  *
8799  * @param addr Physical address of the page to set the HASHED flag on.
8800  */
8801 MARK_AS_PMAP_TEXT void
8802 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8803 {
8804 	/* Ignore non-managed kernel memory. */
8805 	if (!pa_valid(addr)) {
8806 		return;
8807 	}
8808 
8809 	const unsigned int pai = pa_index(addr);
8810 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8811 		pv_entry_t **pv_h = pai_to_pvh(pai);
8812 
8813 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8814 		pvh_lock(pai);
8815 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8816 		pvh_unlock(pai);
8817 	}
8818 }
8819 
8820 /**
8821  * Loop through every physical page in the system and clear out the HASHED flag
8822  * on every PPL-owned page. That flag is used to keep track of which pages have
8823  * been hashed into the hibernation image during the hibernation entry process.
8824  *
8825  * The HASHED flag needs to be cleared out between hibernation cycles because the
8826  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8827  * image with the HASHED flag set on certain pages. It's important to clear the
8828  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8829  * into the hibernation image can't be compromised across hibernation cycles.
8830  */
8831 MARK_AS_PMAP_TEXT void
8832 pmap_clear_ppl_hashed_flag_all(void)
8833 {
8834 	const unsigned int last_index = pa_index(vm_last_phys);
8835 	pv_entry_t **pv_h = NULL;
8836 
8837 	for (int pai = 0; pai < last_index; ++pai) {
8838 		pv_h = pai_to_pvh(pai);
8839 
8840 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8841 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8842 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8843 			pvh_lock(pai);
8844 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8845 			pvh_unlock(pai);
8846 		}
8847 	}
8848 }
8849 
8850 /**
8851  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8852  * ppl_hib driver will call this after all wired pages have been copied into the
8853  * hibernation image.
8854  */
8855 MARK_AS_PMAP_TEXT void
8856 pmap_check_ppl_hashed_flag_all(void)
8857 {
8858 	const unsigned int last_index = pa_index(vm_last_phys);
8859 	pv_entry_t **pv_h = NULL;
8860 
8861 	for (int pai = 0; pai < last_index; ++pai) {
8862 		pv_h = pai_to_pvh(pai);
8863 
8864 		/**
8865 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8866 		 * the pages that contain the PMAP stacks.
8867 		 */
8868 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8869 		    (pai < pa_index(pmap_stacks_end_pa));
8870 
8871 		if (!is_pmap_stack &&
8872 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8873 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8874 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8875 		}
8876 	}
8877 }
8878 
8879 #endif /* XNU_MONITOR */
8880 
8881 /*
8882  * Indicate that a pmap is intended to be used as a nested pmap
8883  * within one or more larger address spaces.  This must be set
8884  * before pmap_nest() is called with this pmap as the 'subordinate'.
8885  */
/*
 * Atomically convert a PMAP_TYPE_USER pmap into a PMAP_TYPE_NESTED pmap,
 * verifying it is neither currently active on any CPU (XNU_MONITOR) nor
 * already carrying a nested pmap of its own, then release its ASID.
 * Panics on any violation.
 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Only a plain user pmap may become nested; the CAS also publishes the transition. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Nested pmaps don't own a private ASID; return this one's to the allocator. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8932 
/*
 * Wrapper for pmap_set_nested_internal(); routes through the PPL on
 * XNU_MONITOR configurations.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8943 
/* Return true iff this pmap has been converted to a nested (shared-region) pmap. */
bool
pmap_is_nested(
	pmap_t pmap)
{
	return pmap->type == PMAP_TYPE_NESTED;
}
8950 
8951 /*
8952  * pmap_trim_range(pmap, start, end)
8953  *
8954  * pmap  = pmap to operate on
8955  * start = start of the range
8956  * end   = end of the range
8957  *
8958  * Attempts to deallocate TTEs for the given range in the nested range.
8959  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The range to trim must lie wholly inside the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to tear down here; release the lock ourselves. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the whole L2 table; it may only be freed if fully empty. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9075 
9076 /**
9077  * State machine for multi-step pmap trimming. Trimming is the action of
9078  * deallocating the TTEs of the shared region of pmaps down to a given range.
9079  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9080  * disabling preemption for too long. These steps include computing the bounds
9081  * of the shared region, trimming the head of the "grand", trimming the tail of
9082  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9083  * different conditions.
9084  *
9085  * @param grand the pmap in which the pages are nested
9086  * @param subord the pmap from which the pages are shared, or nested
9087  * @param vstart start of the used range in "grand"
9088  * @param size size of the used range
9089  * @param state the current state of the state machine
9090  *
9091  * @return the next state of the state machine, to be used in the next call
9092  *         into this function.
9093  */
9094 MARK_AS_PMAP_TEXT pmap_trim_state_t
9095 pmap_trim_internal(
9096 	pmap_t grand,
9097 	pmap_t subord,
9098 	addr64_t vstart,
9099 	uint64_t size,
9100 	pmap_trim_state_t state)
9101 {
9102 	/* Validation needs to be done regardless of state. */
9103 	addr64_t vend;
9104 
9105 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9106 		panic("%s: grand addr wraps around, "
9107 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9108 		    __func__, grand, subord, (void*)vstart, size, state);
9109 	}
9110 
9111 	validate_pmap_mutable(grand);
9112 	validate_pmap(subord);
9113 
9114 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9115 		panic("%s: subord is of non-nestable type 0x%hhx, "
9116 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9117 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9118 	}
9119 
9120 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9121 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9122 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9123 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9124 	}
9125 
9126 	if (__improbable(grand->nested_pmap != subord)) {
9127 		panic("%s: grand->nested != subord, "
9128 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9129 		    __func__, grand, subord, (void*)vstart, size, state);
9130 	}
9131 
9132 	if (__improbable((size != 0) &&
9133 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9134 		panic("%s: grand range not in nested region, "
9135 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9136 		    __func__, grand, subord, (void*)vstart, size, state);
9137 	}
9138 
9139 	/* Trimming starts with figuring out the bounds for the grand. */
9140 	if (state == PMAP_TRIM_STATE_START) {
9141 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9142 
9143 		/**
9144 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9145 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9146 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9147 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9148 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9149 		 * PMAP_TRIM_STATE_DONE.
9150 		 */
9151 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9152 			assert(subord->nested_bounds_set);
9153 
9154 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9155 			if (!grand->nested_bounds_set) {
9156 				/* Inherit the bounds from subord. */
9157 				grand->nested_region_true_start = subord->nested_region_true_start;
9158 				grand->nested_region_true_end = subord->nested_region_true_end;
9159 				grand->nested_bounds_set = true;
9160 			}
9161 
9162 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9163 
9164 			/* Now that the grand has bounds, we are done. */
9165 			return PMAP_TRIM_STATE_DONE;
9166 		}
9167 
9168 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9169 		if ((!subord->nested_bounds_set) && size) {
9170 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9171 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9172 
9173 			subord->nested_region_true_start = vstart;
9174 			subord->nested_region_true_end = vend;
9175 			subord->nested_region_true_start &= ~adjust_offmask;
9176 
9177 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9178 				panic("%s: padded true end wraps around, "
9179 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9180 				    __func__, grand, subord, (void*)vstart, size, state);
9181 			}
9182 
9183 			subord->nested_region_true_end &= ~adjust_offmask;
9184 			subord->nested_bounds_set = true;
9185 		}
9186 
9187 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9188 		if (subord->nested_bounds_set) {
9189 			/* Inherit the bounds from subord. */
9190 			grand->nested_region_true_start = subord->nested_region_true_start;
9191 			grand->nested_region_true_end = subord->nested_region_true_end;
9192 			grand->nested_bounds_set = true;
9193 
9194 			/* If we know the bounds, we can trim the pmap. */
9195 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9196 
9197 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9198 		} else {
9199 			/* Don't trim if we don't know the bounds. */
9200 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9201 
9202 			return PMAP_TRIM_STATE_DONE;
9203 		}
9204 	}
9205 
9206 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9207 	if (!grand->nested_bounds_set) {
9208 		panic("%s: !grand->nested_bounds_set, "
9209 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9210 		    __func__, grand, subord, (void*)vstart, size, state);
9211 	}
9212 
9213 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9214 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9215 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9216 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9217 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9218 			    (unsigned int)grand->nested_no_bounds_ref_state);
9219 		}
9220 
9221 #if XNU_MONITOR
9222 		if (pmap_pending_preemption()) {
9223 			return PMAP_TRIM_STATE_GRAND_AFTER;
9224 		}
9225 #endif
9226 
9227 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9228 	}
9229 
9230 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9231 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9232 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9233 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9234 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9235 			    (unsigned int)grand->nested_no_bounds_ref_state);
9236 		}
9237 
9238 #if XNU_MONITOR
9239 		if (pmap_pending_preemption()) {
9240 			return PMAP_TRIM_STATE_SUBORD;
9241 		}
9242 #endif
9243 
9244 		state = PMAP_TRIM_STATE_SUBORD;
9245 	}
9246 
9247 	/* START state is guaranteed to compute the bounds for the subord. */
9248 	if (!subord->nested_bounds_set) {
9249 		panic("%s: !subord->nested_bounds_set, "
9250 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9251 		    __func__, grand, subord, (void*)vstart, size, state);
9252 	}
9253 
9254 	if (state == PMAP_TRIM_STATE_SUBORD) {
9255 		/**
9256 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9257 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9258 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9259 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9260 		 * the state update is visible only once the preceding trim operation is complete.  An
9261 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9262 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9263 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9264 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9265 		 * of the state CAS.
9266 		 */
9267 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9268 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9269 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9270 			    (unsigned int)grand->nested_no_bounds_ref_state);
9271 		}
9272 		pmap_trim_subord(subord);
9273 	}
9274 
9275 	return PMAP_TRIM_STATE_DONE;
9276 }
9277 
/**
 * Drop this pmap's no-bounds reference on its nested pmap (if it holds one)
 * and trim this pmap's own excess nested-region page tables.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's bounds under the lock so they can be used after unlock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the head and tail of this pmap's nested region outside the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9301 
9302 /*
9303  * pmap_trim_subord(grand, subord)
9304  *
9305  * grand  = pmap that we have nested subord in
9306  * subord = nested pmap we are attempting to trim
9307  *
9308  * Trims subord if possible
9309  */
9310 MARK_AS_PMAP_TEXT static void
9311 pmap_trim_subord(pmap_t subord)
9312 {
9313 	bool contract_subord = false;
9314 
9315 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9316 
9317 	subord->nested_no_bounds_refcnt--;
9318 
9319 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9320 		/* If this was the last no bounds reference, trim subord. */
9321 		contract_subord = true;
9322 	}
9323 
9324 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9325 
9326 	if (contract_subord) {
9327 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9328 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9329 	}
9330 }
9331 
9332 /**
9333  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9334  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9335  * disabling preemption for too long.
9336  *
9337  * @note When we load the shared region we always create pages tables for the
9338  *       entire region. In practice, the shared cache may use just a portion
9339  *       of that. Before we know the bounds of the shared region, it can
9340  *       already be mapped into processes. Therefore, once the bounds are
9341  *       known, "trimming" comes in handy to remove the unnecessary page
9342  *       tables in the processes the shared region is mapped in, and eventually
9343  *       those in the shared region itself. Note that the shared region must
9344  *       be trimmed after the user processes because it has the L3 entries
9345  *       everyone else is pointing to.
9346  *
9347  * @param grand the pmap in which the pages are nested
9348  * @param subord the pmap from which the pages are shared, or nested
9349  * @param vstart start of the used range in "grand"
9350  * @param size size of the used range
9351  */
9352 void
9353 pmap_trim(
9354 	pmap_t grand,
9355 	pmap_t subord,
9356 	addr64_t vstart,
9357 	uint64_t size)
9358 {
9359 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9360 
9361 #if XNU_MONITOR
9362 	/* On PPL systems, drives the state machine until its done. */
9363 	while (state != PMAP_TRIM_STATE_DONE) {
9364 		__assert_only pmap_trim_state_t old_state = state;
9365 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9366 
9367 		/* Are we making progress? */
9368 		assert(old_state != state);
9369 	}
9370 
9371 	pmap_ledger_check_balance(grand);
9372 	pmap_ledger_check_balance(subord);
9373 #else
9374 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9375 
9376 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9377 	assert(state == PMAP_TRIM_STATE_DONE);
9378 #endif
9379 }
9380 
9381 #if HAS_APPLE_PAC
/**
 * Sign a user-space pointer with a process-independent PAC key while the
 * given user JOP key is temporarily installed.
 *
 * @param value the pointer to sign
 * @param key must be ptrauth_key_asia or ptrauth_key_asda (panics otherwise)
 * @param discriminator PAC discriminator to sign with
 * @param jop_key user JOP key to install for the duration of the signing
 *
 * @return the signed pointer
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only process-independent keys are allowed for signing user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/* Keep interrupts disabled while the user JOP key is swapped in. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/*
	 * Barriers force 'value' to be materialized before the sign and 'res'
	 * after it, so the compiler can't hoist/sink the PAC operation outside
	 * the window in which the user JOP key is installed.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9413 
9414 void *
9415 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9416 {
9417 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9418 }
9419 
/**
 * Authenticate (strip and validate) a user-space pointer signed with a
 * process-independent PAC key, with the given user JOP key installed.
 *
 * @param value the signed pointer to authenticate
 * @param key must be ptrauth_key_asia or ptrauth_key_asda (panics otherwise)
 * @param discriminator PAC discriminator the pointer was signed with
 * @param jop_key user JOP key to install for the duration of the auth
 *
 * @return the authenticated pointer
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only process-independent keys are allowed for authenticating user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/* Keep interrupts disabled while the user JOP key is swapped in. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/*
	 * Barriers pin the auth operation inside the window in which the user
	 * JOP key is installed (see pmap_sign_user_ptr_internal).
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9440 
9441 void *
9442 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9443 {
9444 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9445 }
9446 #endif /* HAS_APPLE_PAC */
9447 
9448 /*
9449  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9450  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9451  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9452  * return value, to indicate where a preempted [un]nest operation should resume.
9453  * When the return value contains the ending address of the nested region with
9454  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9455  */
9456 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9457 
9458 /*
9459  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9460  *
9461  *	grand  = the pmap that we will nest subord into
9462  *	subord = the pmap that goes into the grand
9463  *	vstart  = start of range in pmap to be inserted
9464  *	size   = Size of nest area (up to 16TB)
9465  *
9466  *	Inserts a pmap into another.  This is used to implement shared segments.
9467  *
9468  */
9469 
9470 /**
9471  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9472  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9473  * This function operates in 3 main phases:
9474  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9475  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9476  *    the mapping range are present in subord.
9477  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9478  *    contains pointers to subord's leaf-level pagetable pages for the specified
9479  *    VA range.
9480  *
9481  * This function may return early due to pending AST_URGENT preemption; if so
9482  * it will indicate the need to be re-entered.
9483  *
9484  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9485  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9486  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9487  * @param size twig-aligned size of the nesting range
9488  * @param vrestart the twig-aligned starting address of the current call.  May contain
9489  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9490  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9491  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9492  *
9493  * @return the virtual address at which to restart the operation, possibly including
9494  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9495  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9496  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	uint64_t        nested_region_unnested_table_bitmap_size;
	unsigned int*   nested_region_unnested_table_bitmap = NULL;
	uint64_t        new_nested_region_unnested_table_bitmap_size;
	unsigned int*   new_nested_region_unnested_table_bitmap = NULL;
	int             expand_options = 0;
	bool            deref_subord = true;
	bool            grand_locked = false;

	/* Validate the requested range and restart point before touching any state. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Start, size, and restart address must all be twig (leaf-table) aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1: bookkeeping.  Lazily allocate subord's unnested-table bitmap
	 * (one bit per twig-level table in the nested region) on first nesting.
	 * The bitmap is allocated outside the lock and published under it.
	 */
	if (subord->nested_region_unnested_table_bitmap == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may have won the race. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the race: free our unused bitmap allocation. */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/* If the request extends beyond subord's current region, grow the bitmap. */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t        new_size;

		nested_region_unnested_table_bitmap = NULL;
		nested_region_unnested_table_bitmap_size = 0ULL;
		new_size =  vend - subord->nested_region_addr;

		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_unnested_table_bitmap = kalloc_data(
			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock; swap in the larger bitmap if still needed. */
		if (subord->nested_region_size < new_size) {
			bcopy(subord->nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
			nested_region_unnested_table_bitmap_size  = subord->nested_region_unnested_table_bitmap_size;
			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free the old bitmap (if we swapped) or the unused new one (if we raced). */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
		if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			new_nested_region_unnested_table_bitmap = NULL;
		}
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	/* First nesting into grand: claim grand->nested_pmap atomically. */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
		/**
		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
		 * into a nested pmap, which would then produce multiple levels of nesting.
		 */
		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
		}
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		if (__improbable(vstart < subord->nested_region_addr ||
		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* grand was already nested: validate and possibly extend its region. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working range to subord's true bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* If a prior call already finished expanding subord, skip straight to phase 3. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 2: expand subord so leaf-level tables exist for the whole range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/* Periodically yield: return a resumable vrestart if preemption is pending. */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Mark the restart point as being in the grand-copy phase. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 3: copy subord's twig-level TTEs into grand. */
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate.  In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Completed the clamped range: report completion for the full requested range. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	/* Free any bitmap allocations that were never published. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(new_nested_region_unnested_table_bitmap,
		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9922 
/**
 * Nest a subordinate pmap's page tables into a top-level pmap ('grand') over
 * the given VA range, by repeatedly invoking the (possibly preemptible)
 * helper until it reports completion.
 *
 * @param grand pmap into which the mappings will be nested
 * @param subord pmap supplying the mappings to be nested
 * @param vstart virtual start address of the range to nest
 * @param size size of the range to nest
 *
 * @return KERN_SUCCESS on completion, or the first hard failure reported
 *         by the underlying helper.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	/* Only consumed by the forward-progress check in the XNU_MONITOR loop. */
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/*
	 * The PPL helper may bail out early for resource shortage, lock
	 * contention (KERN_ABORTED), or pending preemption; loop until the
	 * returned cursor indicates the whole range has been nested.
	 */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor; anything else would loop forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9981 
9982 /*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
9984  *
9985  *	grand  = the pmap that will have the virtual range unnested
9986  *	vaddr  = start of range in pmap to be unnested
9987  *	size   = size of range in pmap to be unnested
9988  *
9989  */
9990 
kern_return_t
pmap_unnest(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size)
{
	/* Convenience wrapper: unnest with no extra option flags. */
	return pmap_unnest_options(grand, vaddr, size, 0);
}
9999 
10000 /**
10001  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10002  * from a top-level pmap ('grand').  The corresponding mappings in the nested
10003  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10004  * still have the region nested.  The mappings in 'grand' will be left empty
10005  * with the assumption that they will be demand-filled by subsequent access faults.
10006  *
10007  * This function operates in 2 main phases:
10008  * 1. Iteration over the nested pmap's mappings for the specified range to mark
10009  *    them non-global.
10010  * 2. Clearing of the twig-level TTEs for the address range in grand.
10011  *
10012  * This function may return early due to pending AST_URGENT preemption; if so
10013  * it will indicate the need to be re-entered.
10014  *
10015  * @param grand pmap from which to unnest mappings
10016  * @param vaddr twig-aligned virtual address for the beginning of the nested range
10017  * @param size twig-aligned size of the nested range
10018  * @param vrestart the page-aligned starting address of the current call.  May contain
10019  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10020  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10021  *        grand is being torn down and step 1) above is not needed.
10022  *
10023  * @return the virtual address at which to restart the operation, possibly including
10024  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
10025  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10026  */
10027 MARK_AS_PMAP_TEXT vm_map_offset_t
10028 pmap_unnest_options_internal(
10029 	pmap_t grand,
10030 	addr64_t vaddr,
10031 	uint64_t size,
10032 	vm_map_offset_t vrestart,
10033 	unsigned int option)
10034 {
10035 	vm_map_offset_t start;
10036 	vm_map_offset_t addr;
10037 	tt_entry_t     *tte_p;
10038 	unsigned int    current_index;
10039 	unsigned int    start_index;
10040 	unsigned int    max_index;
10041 	unsigned int    entry_count = 0;
10042 
10043 	addr64_t vend;
10044 	addr64_t true_end;
10045 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
10046 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
10047 	}
10048 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
10049 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
10050 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
10051 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
10052 	}
10053 
10054 	validate_pmap_mutable(grand);
10055 
10056 	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10057 		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10058 	}
10059 
10060 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
10061 
10062 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10063 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
10064 		    (unsigned long long)vaddr, (unsigned long long)size);
10065 	}
10066 
10067 	if (__improbable(grand->nested_pmap == NULL)) {
10068 		panic("%s: %p has no nested pmap", __func__, grand);
10069 	}
10070 
10071 	true_end = vend;
10072 	if (true_end > grand->nested_pmap->nested_region_true_end) {
10073 		true_end = grand->nested_pmap->nested_region_true_end;
10074 	}
10075 
10076 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10077 		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10078 			return vrestart;
10079 		}
10080 
10081 		start = vrestart;
10082 		if (start < grand->nested_pmap->nested_region_true_start) {
10083 			start = grand->nested_pmap->nested_region_true_start;
10084 		}
10085 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10086 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10087 		bool flush_tlb = false;
10088 
10089 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10090 			pt_entry_t  *bpte, *cpte;
10091 
10092 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10093 
10094 			bpte = pmap_pte(grand->nested_pmap, addr);
10095 
10096 			/*
10097 			 * If we've re-entered this function partway through unnesting a leaf region, the
10098 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10099 			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
10100 			 * address.
10101 			 */
10102 			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10103 			    (addr & pt_attr_twig_offmask(pt_attr))) {
10104 				/*
10105 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
10106 				 * the nested pmap in this region will now be marked non-global.  Do this
10107 				 * before marking any of the PTEs within the region as non-global to avoid
10108 				 * the possibility of pmap_enter() subsequently inserting a global mapping
10109 				 * in the region, which could lead to a TLB conflict if a non-global entry
10110 				 * is later inserted for the same VA in a pmap which has fully unnested this
10111 				 * region.
10112 				 */
10113 				setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10114 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10115 					pmap_paddr_t    pa;
10116 					unsigned int    pai = 0;
10117 					boolean_t               managed = FALSE;
10118 					pt_entry_t  spte;
10119 
10120 					if ((*cpte != ARM_PTE_TYPE_FAULT)
10121 					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10122 						spte = *((volatile pt_entry_t*)cpte);
10123 						while (!managed) {
10124 							pa = pte_to_pa(spte);
10125 							if (!pa_valid(pa)) {
10126 								break;
10127 							}
10128 							pai = pa_index(pa);
10129 							pvh_lock(pai);
10130 							spte = *((volatile pt_entry_t*)cpte);
10131 							pa = pte_to_pa(spte);
10132 							if (pai == pa_index(pa)) {
10133 								managed = TRUE;
10134 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
10135 							}
10136 							pvh_unlock(pai);
10137 						}
10138 
10139 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10140 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
10141 							flush_tlb = true;
10142 						}
10143 
10144 						if (managed) {
10145 							pvh_assert_locked(pai);
10146 							pvh_unlock(pai);
10147 						}
10148 					}
10149 
10150 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10151 					vrestart = addr;
10152 					++entry_count;
10153 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10154 					    pmap_pending_preemption())) {
10155 						goto unnest_subord_done;
10156 					}
10157 				}
10158 			}
10159 			addr = vlim;
10160 			vrestart = addr;
10161 			++entry_count;
10162 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10163 			    pmap_pending_preemption())) {
10164 				break;
10165 			}
10166 		}
10167 
10168 unnest_subord_done:
10169 		if (flush_tlb) {
10170 			FLUSH_PTE_STRONG();
10171 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10172 		}
10173 
10174 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10175 		if (current_index < max_index) {
10176 			return vrestart;
10177 		}
10178 	}
10179 
10180 	/*
10181 	 * invalidate all pdes for segment at vaddr in pmap grand
10182 	 */
10183 	if (vrestart & PMAP_NEST_GRAND) {
10184 		addr = vrestart & ~PMAP_NEST_GRAND;
10185 		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10186 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10187 		}
10188 	} else {
10189 		addr = vaddr;
10190 		vrestart = vaddr | PMAP_NEST_GRAND;
10191 	}
10192 
10193 	/**
10194 	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10195 	 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10196 	 * upon reentry.
10197 	 */
10198 	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10199 		return vrestart;
10200 	}
10201 
10202 	if (addr < grand->nested_pmap->nested_region_true_start) {
10203 		addr = grand->nested_pmap->nested_region_true_start;
10204 	}
10205 
10206 	start = addr;
10207 
10208 	while (addr < true_end) {
10209 		tte_p = pmap_tte(grand, addr);
10210 		/*
10211 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10212 		 * so it's possible that a region we're trying to unnest may not have been
10213 		 * nested in the first place.
10214 		 */
10215 		if (tte_p != NULL) {
10216 			*tte_p = ARM_TTE_TYPE_FAULT;
10217 		}
10218 		addr += pt_attr_twig_size(pt_attr);
10219 		vrestart = addr | PMAP_NEST_GRAND;
10220 		++entry_count;
10221 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10222 		    pmap_pending_preemption())) {
10223 			break;
10224 		}
10225 	}
10226 	if (addr >= true_end) {
10227 		vrestart = vend | PMAP_NEST_GRAND;
10228 	}
10229 
10230 	FLUSH_PTE_STRONG();
10231 	PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10232 
10233 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10234 
10235 	return vrestart;
10236 }
10237 
10238 kern_return_t
10239 pmap_unnest_options(
10240 	pmap_t grand,
10241 	addr64_t vaddr,
10242 	uint64_t size,
10243 	unsigned int option)
10244 {
10245 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10246 	vm_map_offset_t vend = vaddr + size;
10247 
10248 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10249 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10250 
10251 	pmap_verify_preemptible();
10252 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10253 #if XNU_MONITOR
10254 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10255 #else
10256 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10257 #endif
10258 	}
10259 
10260 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10261 
10262 	return KERN_SUCCESS;
10263 }
10264 
/*
 * arm64 performs no adjustment of the unnest range; always report TRUE so
 * the VM layer proceeds (and reaches log_unnest_badness() when applicable).
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10273 
10274 #if PMAP_FORK_NEST
10275 /**
10276  * Perform any necessary pre-nesting of the parent's shared region at fork()
10277  * time.
10278  *
10279  * @note This should only be called from vm_map_fork().
10280  *
10281  * @param old_pmap The pmap of the parent task.
10282  * @param new_pmap The pmap of the child task.
10283  * @param nesting_start An output parameter that is updated with the start
10284  *                      address of the range that was pre-nested
10285  * @param nesting_end An output parameter that is updated with the end
10286  *                      address of the range that was pre-nested
10287  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10289  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10290  */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Parent has no shared region nested; nothing to pre-nest. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}
	/*
	 * NOTE(review): pmap_nest()'s return value is ignored here; the assertf()
	 * below validates that the nesting state was recorded on new_pmap, but a
	 * non-KERN_SUCCESS result would not be propagated to the caller — confirm
	 * this is intentional.
	 */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* Report the pre-nested range back to vm_map_fork(). */
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
10322 #endif /* PMAP_FORK_NEST */
10323 
10324 /*
10325  * disable no-execute capability on
10326  * the specified pmap
10327  */
#if DEVELOPMENT || DEBUG
/* Disable no-execute enforcement for mappings entered into this pmap. */
void
pmap_disable_NX(
	pmap_t pmap)
{
	pmap->nx_enabled = FALSE;
}
#else
/* On RELEASE kernels, disabling NX is not permitted; this is a no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10342 
10343 /*
10344  * flush a range of hardware TLB entries.
10345  * NOTE: assumes the smallest TLB entry in use will be for
10346  * an ARM small page (4K).
10347  */
10348 
10349 #if __ARM_RANGE_TLBI__
10350 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10351 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10352 #else
10353 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10354 #endif // __ARM_RANGE_TLBI__
10355 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10356     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10357     "of npages to 32 bits below may truncate.");
10358 
/*
 * Asynchronously flush TLB entries for [va, va + length) in the given pmap,
 * choosing the cheapest applicable strategy:
 *   - very large ranges: flush the whole ASID (or the whole TLB for shared /
 *     nested pmaps and ASID 0);
 *   - medium ranges (when range TLBI is supported): a single range invalidate;
 *   - small ranges: per-entry invalidates.
 * No barriers are issued; the caller is responsible for sync_tlb_flush().
 * last_level_only/strong are marked __unused because some configurations
 * compile out the paths that consume them.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * Nested (shared-region) pmaps enter non-global mappings under other
		 * ASIDs, so a per-ASID flush is insufficient; likewise for ASID 0.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: invalidate entry by entry, encoding ASID into the operand. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10411 
/* Asynchronously flush every TLB entry tagged with this pmap's ASID.
 * The caller must follow up with a TLB synchronization barrier. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10417 
/* Synchronously flush kernel TLB entries for [va, va + length). */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10426 
10427 unsigned int
10428 pmap_cache_attributes(
10429 	ppnum_t pn)
10430 {
10431 	pmap_paddr_t    paddr;
10432 	unsigned int    pai;
10433 	unsigned int    result;
10434 	pp_attr_t       pp_attr_current;
10435 
10436 	paddr = ptoa(pn);
10437 
10438 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10439 
10440 	if (!pa_valid(paddr)) {
10441 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10442 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10443 	}
10444 
10445 	result = VM_WIMG_DEFAULT;
10446 
10447 	pai = pa_index(paddr);
10448 
10449 	pp_attr_current = pp_attr_table[pai];
10450 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10451 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10452 	}
10453 	return result;
10454 }
10455 
/*
 * Perform any cache maintenance required when a page's WIMG (cacheability)
 * attribute changes: sync the page when it leaves a cacheable memory type,
 * and force a dcache clean when it becomes real-time (RT) memory.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * NOTE(review): the final clause '(wimg_bits_new != VM_WIMG_COPYBACK) ||
	 * (wimg_bits_new != VM_WIMG_INNERWBACK)' is always true (a value cannot
	 * equal both), so any change away from VM_WIMG_WTHRU triggers a sync.
	 * Possibly '&&' was intended; the current form is merely conservative
	 * (extra syncs, never missed ones) — confirm intent before changing.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Entering RT (effectively uncacheable): flush any stale dirty lines. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10472 
/*
 * Switch a managed compressor page between cache attributes, under the PVH
 * lock, then perform any cache maintenance the transition requires.
 * Panics on non-managed pages and (on PPL systems) on PPL-owned pages.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* 'true' requests an immediate TLB flush for the updated mappings. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance must happen outside the PVH lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10497 
/*
 * Return a kernel virtual mapping for a compressor page.  On targets with a
 * physical aperture, first force the page to VM_WIMG_DEFAULT so the aperture
 * mapping's cacheability matches (avoiding mismatched-attribute aliases).
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10513 
/*
 * Undo pmap_map_compressor_page(): restore the page's original cache
 * attribute if it was temporarily forced to VM_WIMG_DEFAULT for the
 * physical-aperture mapping.  kva is unused; the aperture mapping persists.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10528 
10529 /**
10530  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10531  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10532  *
10533  * @param user_page_list List of pages to be updated.
10534  * @param page_cnt Number of pages in total in user_page_list.
10535  * @param cacheattr The new cache attribute.
10536  *
10537  * @return Success if true is returned.
10538  */
10539 bool
10540 pmap_batch_set_cache_attributes(
10541 	upl_page_info_array_t user_page_list,
10542 	unsigned int page_cnt,
10543 	unsigned int cacheattr)
10544 {
10545 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10546 
10547 	if (page_cnt == 0) {
10548 		return true;
10549 	}
10550 
10551 	batch_set_cache_attr_state_t states;
10552 	states.page_index = 0;
10553 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10554 	states.tlb_flush_pass_needed = false;
10555 	states.rt_cache_flush_pass_needed = false;
10556 
10557 	/* Verify we are being called from a preemptible context. */
10558 	pmap_verify_preemptible();
10559 
10560 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10561 #if XNU_MONITOR
10562 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10563 #else /* !XNU_MONITOR */
10564 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10565 #endif /* XNU_MONITOR */
10566 	}
10567 
10568 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10569 	return true;
10570 }
10571 
10572 /**
10573  * Flushes TLB entries associated with the page specified by paddr, but do not
10574  * issue barriers yet.
10575  *
10576  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10577  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* The PVH lock keeps the PV list stable while we walk it. */
	pvh_assert_locked(pai);

	/* The PV head either holds a single PTE pointer or a PVE list. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page; each PVE holds up to PTE_PER_PVE PTEs. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU TLB entries to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10631 
10632 /**
10633  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10634  *
10635  * @param pai The Physical Address Index of the entry.
10636  * @param cacheattr The new cache attribute.
10637  */
10638 MARK_AS_PMAP_TEXT static void
10639 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10640 {
10641 	pvh_assert_locked(pai);
10642 
10643 	pp_attr_t pp_attr_current, pp_attr_template;
10644 	do {
10645 		pp_attr_current = pp_attr_table[pai];
10646 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10647 
10648 		/**
10649 		 * WIMG bits should only be updated under the PVH lock, but we should do
10650 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10651 		 */
10652 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10653 }
10654 
10655 /**
10656  * Batch updates the cache attributes of a list of pages in three passes.
10657  *
10658  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10659  * In pass two, TLB entries are flushed for each page in the list if necessary.
10660  * In pass three, caches are cleaned for each page in the list if necessary.
10661  *
10662  * When running in PPL, this function may decide to return to the caller in response
10663  * to AST_URGENT.
10664  *
10665  * @param user_page_list List of pages to be updated.
10666  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10667  * @param page_cnt Number of pages in total in user_page_list.
10668  * @param cacheattr The new cache attributes.
10669  *
10670  * @return The new state of the state machine.
10671  */
10672 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10673 pmap_batch_set_cache_attributes_internal(
10674 #if XNU_MONITOR
10675 	volatile upl_page_info_t *user_page_list,
10676 #else /* !XNU_MONITOR */
10677 	upl_page_info_array_t user_page_list,
10678 #endif /* XNU_MONITOR */
10679 	batch_set_cache_attr_state_t states,
10680 	unsigned int page_cnt,
10681 	unsigned int cacheattr)
10682 {
10683 	uint64_t page_index = states.page_index;
10684 	uint64_t state = states.state;
10685 	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10686 	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10687 
10688 	/* For verifying progress. */
10689 	__assert_only const uint64_t page_index_old = page_index;
10690 	__assert_only const uint64_t state_old = state;
10691 
10692 	/* Assert page_index and state are within their range. */
10693 	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10694 		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10695 	}
10696 
10697 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10698 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10699 		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
10700 		while (page_index < page_cnt) {
10701 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10702 			const pmap_paddr_t paddr = ptoa(pn);
10703 
10704 			if (!pa_valid(paddr)) {
10705 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10706 			}
10707 
10708 			const unsigned int pai = pa_index(paddr);
10709 
10710 			/* Lock the page. */
10711 			pvh_lock(pai);
10712 
10713 #if XNU_MONITOR
10714 			if (ppattr_pa_test_monitor(paddr)) {
10715 				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10716 			}
10717 #endif /* XNU_MONITOR */
10718 			const pp_attr_t pp_attr_current = pp_attr_table[pai];
10719 
10720 			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10721 			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10722 				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10723 			}
10724 
10725 			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10726 
10727 			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10728 			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10729 				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10730 			}
10731 
10732 			/* Update the cache attributes in PTE and PP_ATTR table. */
10733 			if (wimg_bits_new != wimg_bits_prev) {
10734 				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10735 				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10736 			}
10737 
10738 			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10739 				rt_cache_flush_pass_needed = true;
10740 			}
10741 
10742 			pvh_unlock(pai);
10743 
10744 			page_index++;
10745 
10746 #if XNU_MONITOR
10747 			/**
10748 			 * Check for AST_URGENT every page, as the pve list search in cache
10749 			 * update can take non-constant time.
10750 			 */
10751 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10752 				goto pbscai_exit;
10753 			}
10754 #endif /* XNU_MONITOR */
10755 		}
10756 
10757 		/* page_index == page_cnt && !pmap_pending_preemption() */
10758 		if (tlb_flush_pass_needed) {
10759 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10760 		} else if (rt_cache_flush_pass_needed) {
10761 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10762 		} else {
10763 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10764 		}
10765 		page_index = 0;
10766 
10767 		/* Sync the PTE writes before potential TLB/Cache flushes. */
10768 		FLUSH_PTE_STRONG();
10769 
10770 #if XNU_MONITOR
10771 		if (__improbable(pmap_pending_preemption())) {
10772 			goto pbscai_exit;
10773 		}
10774 #endif /* XNU_MONITOR */
10775 	}
10776 
10777 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10778 		/**
10779 		 * Pass 2: for each physical page and for each mapping, we need to flush
10780 		 * the TLB for it.
10781 		 */
10782 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10783 		while (page_index < page_cnt) {
10784 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10785 
10786 			const pmap_paddr_t paddr = ptoa(pn);
10787 			if (!pa_valid(paddr)) {
10788 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10789 			}
10790 
10791 			const unsigned int pai = pa_index(paddr);
10792 
10793 			pvh_lock(pai);
10794 			pmap_flush_tlb_for_paddr_locked_async(paddr);
10795 			pvh_unlock(pai);
10796 
10797 			page_index++;
10798 
10799 #if XNU_MONITOR
10800 			/**
10801 			 * Check for AST_URGENT every page, as the pve list search in cache
10802 			 * update can take non-constant time.
10803 			 */
10804 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10805 				goto pbscai_exit;
10806 			}
10807 #endif /* XNU_MONITOR */
10808 		}
10809 
10810 #if HAS_FEAT_XS
10811 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10812 		arm64_sync_tlb(false);
10813 #else
10814 		/**
10815 		 * For targets that distinguish between mild and strong DSB, mild DSB
10816 		 * will not drain the prefetcher.  This can lead to prefetch-driven
10817 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
10818 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10819 		 */
10820 		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10821 #endif
10822 
10823 		if (rt_cache_flush_pass_needed) {
10824 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10825 		} else {
10826 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10827 		}
10828 		page_index = 0;
10829 
10830 #if XNU_MONITOR
10831 		if (__improbable(pmap_pending_preemption())) {
10832 			goto pbscai_exit;
10833 		}
10834 #endif /* XNU_MONITOR */
10835 	}
10836 
10837 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
10838 		/* Pass 3: Flush the cache if the page is recently set to RT */
10839 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
10840 #if !XNU_MONITOR
10841 		/**
10842 		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
10843 		 * in the state where DC by VA instructions remain enabled.
10844 		 */
10845 		disable_preemption();
10846 #endif /* !XNU_MONITOR */
10847 
10848 		assert(get_preemption_level() > 0);
10849 
10850 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10851 		/**
10852 		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10853 		 * and the host will handle cache maintenance for it. So we don't need to
10854 		 * worry about enabling the ops here for AVP.
10855 		 */
10856 		enable_dc_mva_ops();
10857 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10858 
10859 		while (page_index < page_cnt) {
10860 			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
10861 
10862 			if (!pa_valid(paddr)) {
10863 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10864 			}
10865 
10866 			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
10867 
10868 			page_index++;
10869 
10870 #if XNU_MONITOR
10871 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10872 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10873 				disable_dc_mva_ops();
10874 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10875 				goto pbscai_exit;
10876 			}
10877 #endif /* XNU_MONITOR */
10878 		}
10879 
10880 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10881 		disable_dc_mva_ops();
10882 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10883 
10884 #if !XNU_MONITOR
10885 		enable_preemption();
10886 #endif /* !XNU_MONITOR */
10887 
10888 		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10889 		page_index = 0;
10890 	}
10891 
10892 #if XNU_MONITOR
10893 pbscai_exit:
10894 #endif /* XNU_MONITOR */
10895 	/* Assert page_index and state are within their range. */
10896 	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
10897 
10898 	/* Make sure we are making progress in this call. */
10899 	assert(page_index > page_index_old || state > state_old);
10900 
10901 	batch_set_cache_attr_state_t states_new;
10902 	states_new.page_index = page_index;
10903 	states_new.state = state;
10904 	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
10905 	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
10906 	return states_new;
10907 }
10908 
/**
 * Common helper for updating the WIMG (cacheability) attributes of a managed
 * physical page and propagating the change to every existing mapping of it.
 *
 * @param pn page number of the physical page to update.
 * @param cacheattr the new VM_WIMG_* attribute; VM_WIMG_USE_DEFAULT is
 *        normalized to VM_WIMG_DEFAULT before use.
 * @param external meaningful only on XNU_MONITOR configurations: TRUE when
 *        called on behalf of the kernel proper, FALSE when called from within
 *        the PPL; used purely for the ownership sanity checks below.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The caller may only change attributes of pages it currently owns. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An all-zero WIMG field in pp_attr_table means "default". */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Rewrite every PTE mapping this page (TLB invalidation happens inside). */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance implied by the old->new WIMG transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10971 
/**
 * Entry point used when the request originates outside the monitor
 * (external == TRUE); thin wrapper around pmap_set_cache_attributes_priv().
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10979 
/**
 * Set the cache (VM_WIMG_*) attributes of a managed physical page.
 *
 * @param pn page number of the page to update.
 * @param cacheattr the new cache attributes.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	/* On PPL systems the update must be carried out inside the PPL. */
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10991 
/**
 * Updates the page numbered ppnum to have attribute specified by attributes.
 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
 * The necessity of the TLB flush is returned in case this function is called
 * in a batched manner and the TLB flush is intended to be done at a different
 * timing.
 *
 * @note The "_locked" suffix indicates the caller is expected to hold the
 *       relevant PVH lock for the page while calling this function.
 *
 * @param ppnum Page Number of the page to be updated.
 * @param attributes The new cache attributes.
 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
 *        immediately.
 *
 * @return Returns true if a TLB flush is needed for this update regardless of
 *         whether a flush has occurred already.
 */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally refuse device-type memory attributes on managed DRAM pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First update the page's alias in the kernel's physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* On PPL systems, preserve the existing XPRR permission bits. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Caller will sync the PTE writes and flush later; cheap write here. */
		write_pte_fast(pte_p, tmplate);
	}
	/* The physmap alias was rewritten, so a flush is needed regardless. */
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* Locate the first mapping: either a single PTE or the head of a PVE list. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Rewrite the attribute bits of every mapping of the page. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot within this PVE; advance to the next slot/PVE. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		/* Exhausted this PVE's PTE slots; move to the next PVE in the list. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11148 
/**
 * Mark a pmap as being dedicated to use for a commpage mapping.
 * The pmap itself will never be activated on a CPU; its mappings will
 * only be embedded in userspace pmaps at a fixed virtual address.
 *
 * @param pmap the pmap to mark as belonging to a commpage.
 */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be created before the PPL is locked down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11175 
11176 static void
11177 pmap_update_tt3e(
11178 	pmap_t pmap,
11179 	vm_address_t address,
11180 	tt_entry_t template)
11181 {
11182 	tt_entry_t *ptep, pte;
11183 
11184 	ptep = pmap_tt3e(pmap, address);
11185 	if (ptep == NULL) {
11186 		panic("%s: no ptep?", __FUNCTION__);
11187 	}
11188 
11189 	pte = *ptep;
11190 	pte = tte_to_pa(pte) | template;
11191 	write_pte_strong(ptep, pte);
11192 }
11193 
/*
 * PTE template for the commpage data mappings: read-only, writeback-cacheable,
 * and non-executable at both EL0 (ARM_PTE_NX) and EL1 (ARM_PTE_PNX).
 * Note absence of non-global bit: the mapping is shared across all ASIDs.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping: as above, but user-executable
 * (no ARM_PTE_NX) while remaining privileged-never-execute (ARM_PTE_PNX).
 * Note absence of non-global bit and no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11205 
/**
 * Allocate the commpage backing pages (a data page, a kernel read-only data
 * page, and — with CONFIG_ARM_PFZ — a text page) and enter them into one or,
 * on mixed-page-size systems, two dedicated commpage pmaps whose translation
 * tables are later nested into every user pmap.
 *
 * @param kernel_data_addr (out) KVA of the commpage data page.
 * @param kernel_text_addr (out) KVA of the commpage text page, or 0 if none.
 * @param kernel_ro_data_addr (out) KVA of the read-only data page (aliases
 *        the data page on non-PPL systems).
 * @param user_text_addr (out) randomized user VA chosen for the text
 *        commpage, or 0 when CONFIG_ARM_PFZ is disabled.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* PPL systems: allocate and zero each page from the PPL page free list. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the leaf entry to be global (no nG bit) per the template. */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index so the text commpage VA is slid per boot. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11356 
11357 
11358 /*
11359  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11360  * with user controlled TTEs for regions that aren't explicitly reserved by the
11361  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11362  */
11363 #if (ARM_PGSHIFT == 14)
11364 /**
11365  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11366  * commpage completely above the maximum 32-bit userspace VA.
11367  */
11368 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11369 
11370 /**
11371  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11372  * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
11374  * commpage lives within the userspace VA range, but is protected by the VM as
11375  * a reserved region (see vm_reserved_regions[] definition for more info).
11376  */
11377 
11378 #elif (ARM_PGSHIFT == 12)
11379 /**
11380  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11381  * above the maximum userspace VA.
11382  */
11383 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11384 #else
11385 #error Nested shared page mapping is unsupported on this config
11386 #endif
11387 
/**
 * Insert ("nest") the commpage pmap's translation tables into the given user
 * pmap at the fixed commpage VA, expanding the pmap's own tables down to the
 * commpage nesting level first if necessary.
 *
 * @param pmap the user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success.  Failures of pmap_expand() are propagated:
 *         KERN_RESOURCE_SHORTAGE (XNU_MONITOR only; caller supplies pages and
 *         retries) or KERN_ABORTED (caller retries); anything else panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL we must not block on allocation; fail and let the caller retry. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock across pmap_expand(), which may allocate and block. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* NOTE(review): compared against ARM_PTE_EMPTY here but ARM_TTE_EMPTY in
	 * pmap_unmap_commpage() for the same entry — presumably equivalent; confirm. */
	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	/* Share the commpage pmap's table by copying its twig entry wholesale. */
	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11490 
/**
 * Remove the commpage nesting from the given pmap by clearing the twig-level
 * entry that points at the shared commpage tables, then flushing the TLB for
 * the commpage VA.
 *
 * @param pmap the pmap to unmap the commpage from.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing nested at this level; nothing to undo. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any cached translations for the commpage VA and wait. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11547 
11548 void
11549 pmap_insert_commpage(
11550 	pmap_t pmap)
11551 {
11552 	kern_return_t kr = KERN_FAILURE;
11553 #if XNU_MONITOR
11554 	do {
11555 		kr = pmap_insert_commpage_ppl(pmap);
11556 
11557 		if (kr == KERN_RESOURCE_SHORTAGE) {
11558 			pmap_alloc_page_for_ppl(0);
11559 		}
11560 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11561 
11562 	pmap_ledger_check_balance(pmap);
11563 #else
11564 	do {
11565 		kr = pmap_insert_commpage_internal(pmap);
11566 	} while (kr == KERN_ABORTED);
11567 #endif
11568 
11569 	if (kr != KERN_SUCCESS) {
11570 		panic("%s: failed to insert the shared page, kr=%d, "
11571 		    "pmap=%p",
11572 		    __FUNCTION__, kr,
11573 		    pmap);
11574 	}
11575 }
11576 
/* Returns TRUE if the pmap was created with a 64-bit address space. */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11583 
/* No pmap is considered "exotic" in this configuration; always false. */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11590 
11591 
11592 /* ARMTODO -- an implementation that accounts for
11593  * holes in the physical map, if any.
11594  */
11595 boolean_t
11596 pmap_valid_page(
11597 	ppnum_t pn)
11598 {
11599 	return pa_valid(ptoa(pn));
11600 }
11601 
11602 boolean_t
11603 pmap_bootloader_page(
11604 	ppnum_t pn)
11605 {
11606 	pmap_paddr_t paddr = ptoa(pn);
11607 
11608 	if (pa_valid(paddr)) {
11609 		return FALSE;
11610 	}
11611 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11612 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11613 }
11614 
/**
 * Scan the VA range [va_start, va_end) of a pmap and report whether it
 * contains no mappings.
 *
 * @param pmap the pmap to scan; NULL is treated as trivially empty.
 * @param va_start start of the range (inclusive).
 * @param va_end end of the range (exclusive).
 *
 * @return TRUE if every PTE in the range is empty, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	/* NOTE(review): pt_attr is marked __unused but is referenced below via the
	 * pt_attr_* macros; presumably the attribute covers configs where those
	 * macros ignore their argument — confirm. */
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample not_in_kdp once so lock and unlock decisions stay consistent. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Skip locking for the kernel pmap and when in the debugger context. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-sized (leaf-table-covered) block at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		/* Only table-type TTEs have a leaf table to inspect. */
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a mapping: unlock (if locked) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11675 
11676 boolean_t
11677 pmap_is_empty(
11678 	pmap_t pmap,
11679 	vm_map_offset_t va_start,
11680 	vm_map_offset_t va_end)
11681 {
11682 #if XNU_MONITOR
11683 	return pmap_is_empty_ppl(pmap, va_start, va_end);
11684 #else
11685 	return pmap_is_empty_internal(pmap, va_start, va_end);
11686 #endif
11687 }
11688 
11689 vm_map_offset_t
11690 pmap_max_offset(
11691 	boolean_t               is64,
11692 	unsigned int    option)
11693 {
11694 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11695 }
11696 
11697 vm_map_offset_t
11698 pmap_max_64bit_offset(
11699 	__unused unsigned int option)
11700 {
11701 	vm_map_offset_t max_offset_ret = 0;
11702 
11703 #if defined(__arm64__)
11704 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11705 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11706 		max_offset_ret = arm64_pmap_max_offset_default;
11707 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11708 		max_offset_ret = min_max_offset;
11709 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11710 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11711 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11712 		if (arm64_pmap_max_offset_default) {
11713 			max_offset_ret = arm64_pmap_max_offset_default;
11714 		} else if (max_mem > 0xC0000000) {
11715 			// devices with > 3GB of memory
11716 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11717 		} else if (max_mem > 0x40000000) {
11718 			// devices with > 1GB and <= 3GB of memory
11719 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11720 		} else {
11721 			// devices with <= 1 GB of memory
11722 			max_offset_ret = min_max_offset;
11723 		}
11724 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11725 		if (arm64_pmap_max_offset_default) {
11726 			// Allow the boot-arg to override jumbo size
11727 			max_offset_ret = arm64_pmap_max_offset_default;
11728 		} else {
11729 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11730 		}
11731 	} else {
11732 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11733 	}
11734 
11735 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11736 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11737 		assert(max_offset_ret >= min_max_offset);
11738 	}
11739 #else
11740 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11741 #endif
11742 
11743 	return max_offset_ret;
11744 }
11745 
11746 vm_map_offset_t
11747 pmap_max_32bit_offset(
11748 	unsigned int option)
11749 {
11750 	vm_map_offset_t max_offset_ret = 0;
11751 
11752 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11753 		max_offset_ret = arm_pmap_max_offset_default;
11754 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11755 		max_offset_ret = VM_MAX_ADDRESS;
11756 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11757 		max_offset_ret = VM_MAX_ADDRESS;
11758 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11759 		if (arm_pmap_max_offset_default) {
11760 			max_offset_ret = arm_pmap_max_offset_default;
11761 		} else if (max_mem > 0x20000000) {
11762 			max_offset_ret = VM_MAX_ADDRESS;
11763 		} else {
11764 			max_offset_ret = VM_MAX_ADDRESS;
11765 		}
11766 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11767 		max_offset_ret = VM_MAX_ADDRESS;
11768 	} else {
11769 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11770 	}
11771 
11772 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11773 	return max_offset_ret;
11774 }
11775 
11776 #if CONFIG_DTRACE
11777 /*
11778  * Constrain DTrace copyin/copyout actions
11779  */
11780 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11781 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11782 
11783 kern_return_t
11784 dtrace_copyio_preflight(
11785 	__unused addr64_t va)
11786 {
11787 	if (current_map() == kernel_map) {
11788 		return KERN_FAILURE;
11789 	} else {
11790 		return KERN_SUCCESS;
11791 	}
11792 }
11793 
/* DTrace copyio postflight: no cleanup is required, so always succeed. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11800 #endif /* CONFIG_DTRACE */
11801 
11802 
/*
 * No flush-context state is needed on this architecture (pmap_flush() is
 * unimplemented), so initialization is intentionally a no-op.
 */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11807 
11808 
11809 void
11810 pmap_flush(
11811 	__unused pmap_flush_context *cpus_to_flush)
11812 {
11813 	/* not implemented yet */
11814 	return;
11815 }
11816 
11817 #if XNU_MONITOR
11818 
11819 /*
11820  * Enforce that the address range described by kva and nbytes is not currently
11821  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11822  * unintentionally writing to PPL-owned memory.
11823  */
11824 void
11825 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11826 {
11827 	vm_offset_t end;
11828 	if (os_add_overflow(kva, nbytes, &end)) {
11829 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11830 	}
11831 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11832 		pmap_paddr_t pa = kvtophys_nofail(ckva);
11833 		unsigned int pai = pa_index(pa);
11834 		pp_attr_t attr;
11835 		if (__improbable(!pa_valid(pa))) {
11836 			panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11837 		}
11838 		pvh_lock(pai);
11839 		if (__improbable(ckva == phystokv(pa))) {
11840 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11841 		}
11842 		do {
11843 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11844 			if (__improbable(attr & PP_ATTR_MONITOR)) {
11845 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11846 			}
11847 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11848 		pvh_unlock(pai);
11849 		if (__improbable(kvtophys_nofail(ckva) != pa)) {
11850 			panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11851 		}
11852 	}
11853 }
11854 
/*
 * Undo pmap_pin_kernel_pages(): clear PP_ATTR_NO_MONITOR on every page
 * backing [kva, kva + nbytes).  Panics if any page was not pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		/* Unpinning a page that was never pinned indicates a bookkeeping bug. */
		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* While pinned, the page must never have become PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11872 
11873 /**
11874  * Lock down a page, making all mappings read-only, and preventing further
11875  * mappings or removal of this particular kva's mapping. Effectively, it makes
11876  * the physical page at kva immutable (see the ppl_writable parameter for an
11877  * exception to this).
11878  *
11879  * @param kva Valid address to any mapping of the physical page to lockdown.
11880  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11881  * @param ppl_writable True if the PPL should still be able to write to the page
11882  *                     using the physical aperture mapping. False will make the
11883  *                     page read-only for both the kernel and PPL in the
11884  *                     physical aperture.
11885  */
11886 
11887 MARK_AS_PMAP_TEXT static void
11888 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11889 {
11890 	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
11891 }
11892 
11893 /**
11894  * Lock down a page, giving all mappings the specified maximum permissions, and
11895  * preventing further mappings or removal of this particular kva's mapping.
11896  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11897  * parameter for an exception to this).
11898  *
11899  * @param kva Valid address to any mapping of the physical page to lockdown.
11900  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11901  * @param ppl_writable True if the PPL should still be able to write to the page
11902  *                     using the physical aperture mapping. False will make the
11903  *                     page read-only for both the kernel and PPL in the
11904  *                     physical aperture.
11905  * @param prot Maximum permissions to allow in existing alias mappings
11906  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Pages already owned by the PPL cannot be locked down on its behalf. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Refuse double-lockdown and lockdown of executable pages. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11954 
11955 /**
11956  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11957  * kernel once again.
11958  *
11959  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11960  *       to unlockdown a page that was never locked down, will panic.
11961  *
11962  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11963  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11964  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11965  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11966  *                     deviation will result in a panic.
11967  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* The page must actually be locked down for the given reason. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11988 
11989 /**
11990  * Release a page from being locked down to the PPL, making it writable to the
11991  * kernel once again.
11992  *
11993  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11994  *       to unlockdown a page that was never locked down, will panic.
11995  *
11996  * @param kva Valid address to any mapping of the physical page to unlockdown.
11997  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11998  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11999  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12000  *                     deviation will result in a panic.
12001  */
12002 MARK_AS_PMAP_TEXT static void
12003 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
12004 {
12005 	const pmap_paddr_t pa = kvtophys_nofail(kva);
12006 	const unsigned int pai = pa_index(pa);
12007 
12008 	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12009 	pvh_lock(pai);
12010 	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
12011 	pvh_unlock(pai);
12012 }
12013 
12014 #else /* XNU_MONITOR */
12015 
/* Non-PPL stub: without a monitor there is no PPL ownership to guard against. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12020 
/* Non-PPL stub: nothing was pinned, so there is nothing to release. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12025 
12026 #endif /* !XNU_MONITOR */
12027 
12028 
12029 MARK_AS_PMAP_TEXT static inline void
12030 pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
12031 {
12032 #if XNU_MONITOR
12033 	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
12034 #else
12035 	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
12036 #endif
12037 }
12038 
12039 MARK_AS_PMAP_TEXT static inline void
12040 pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
12041 {
12042 #if XNU_MONITOR
12043 	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
12044 #else
12045 	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
12046 #endif
12047 }
12048 
12049 /**
12050  * Perform basic validation checks on the destination only and
12051  * corresponding offset/sizes prior to writing to a read only allocation.
12052  *
12053  * @note Should be called before writing to an allocation from the read
12054  * only allocator.
12055  *
12056  * @param zid The ID of the zone the allocation belongs to.
12057  * @param va VA of element being modified (destination).
12058  * @param offset Offset being written to, in the element.
12059  * @param new_data_size Size of modification.
12060  *
12061  */
12062 
12063 MARK_AS_PMAP_TEXT static void
12064 pmap_ro_zone_validate_element_dst(
12065 	zone_id_t           zid,
12066 	vm_offset_t         va,
12067 	vm_offset_t         offset,
12068 	vm_size_t           new_data_size)
12069 {
12070 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12071 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12072 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12073 	}
12074 
12075 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12076 
12077 	/* Check element is from correct zone and properly aligned */
12078 	zone_require_ro(zid, elem_size, (void*)va);
12079 
12080 	if (__improbable(new_data_size > (elem_size - offset))) {
12081 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
12082 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12083 	}
12084 	if (__improbable(offset >= elem_size)) {
12085 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
12086 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12087 	}
12088 }
12089 
12090 
12091 /**
12092  * Perform basic validation checks on the source, destination and
12093  * corresponding offset/sizes prior to writing to a read only allocation.
12094  *
12095  * @note Should be called before writing to an allocation from the read
12096  * only allocator.
12097  *
12098  * @param zid The ID of the zone the allocation belongs to.
12099  * @param va VA of element being modified (destination).
12100  * @param offset Offset being written to, in the element.
12101  * @param new_data Pointer to new data (source).
12102  * @param new_data_size Size of modification.
12103  *
12104  */
12105 
12106 MARK_AS_PMAP_TEXT static void
12107 pmap_ro_zone_validate_element(
12108 	zone_id_t           zid,
12109 	vm_offset_t         va,
12110 	vm_offset_t         offset,
12111 	const vm_offset_t   new_data,
12112 	vm_size_t           new_data_size)
12113 {
12114 	vm_offset_t sum = 0;
12115 
12116 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12117 		panic("%s: Integer addition overflow %p + %lu = %lu",
12118 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12119 	}
12120 
12121 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12122 }
12123 
12124 /**
12125  * Ensure that physical page is locked down before writing to it.
12126  *
12127  * @note Should be called before writing to an allocation from the read
12128  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12129  * ensure that it is called after the modification.
12130  *
12131  *
12132  * @param pa Physical address of the element being modified.
12133  * @param va Virtual address of element being modified.
12134  * @param size Size of the modification.
12135  *
12136  */
12137 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	/* The write must stay within a single page, since only one PVH is locked. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	/* Held until the matching pmap_ro_zone_unlock_phy_page() call. */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12159 
12160 /**
12161  * Unlock physical page after writing to it.
12162  *
12163  * @note Should be called after writing to an allocation from the read
12164  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12165  * ensure that it has been called prior to the modification.
12166  *
12167  * @param pa Physical address of the element that was modified.
12168  * @param va Virtual address of element that was modified.
12169  * @param size Size of the modification.
12170  *
12171  */
12172 
12173 MARK_AS_PMAP_TEXT static void
12174 pmap_ro_zone_unlock_phy_page(
12175 	const pmap_paddr_t  pa,
12176 	vm_offset_t         va __unused,
12177 	vm_size_t           size __unused)
12178 {
12179 	const unsigned int pai = pa_index(pa);
12180 	pvh_unlock(pai);
12181 }
12182 
12183 /**
12184  * Function to copy kauth_cred from new_data to kv.
12185  * Function defined in "kern_prot.c"
12186  *
12187  * @note Will be removed upon completion of
12188  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12189  *
12190  * @param kv Address to copy new data to.
12191  * @param new_data Pointer to new data.
12192  *
12193  */
12194 
12195 extern void
12196 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12197 
12198 /**
12199  * Zalloc-specific memcpy that writes through the physical aperture
12200  * and ensures the element being modified is from a read-only zone.
12201  *
12202  * @note Designed to work only with the zone allocator's read-only submap.
12203  *
12204  * @param zid The ID of the zone to allocate from.
12205  * @param va VA of element to be modified.
12206  * @param offset Offset from element.
12207  * @param new_data Pointer to new data.
12208  * @param new_data_size	Size of modification.
12209  *
12210  */
12211 
/*
 * Dispatch the read-only-zone memcpy either through the PPL or directly to
 * the internal implementation, depending on build configuration.
 */
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12226 
12227 MARK_AS_PMAP_TEXT void
12228 pmap_ro_zone_memcpy_internal(
12229 	zone_id_t             zid,
12230 	vm_offset_t           va,
12231 	vm_offset_t           offset,
12232 	const vm_offset_t     new_data,
12233 	vm_size_t             new_data_size)
12234 {
12235 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12236 
12237 	if (!new_data || new_data_size == 0) {
12238 		return;
12239 	}
12240 
12241 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12242 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12243 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12244 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12245 }
12246 
12247 /**
12248  * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
12250  *
12251  * @note Designed to work only with the zone allocator's read-only submap.
12252  *
12253  * @param zid The ID of the zone the element belongs to.
12254  * @param va VA of element to be modified.
12255  * @param offset Offset in element.
12256  * @param op Atomic operation to perform.
12257  * @param value	Mutation value.
12258  *
12259  */
12260 
/*
 * Dispatch the read-only-zone atomic mutation either through the PPL or
 * directly to the internal implementation, depending on build configuration.
 */
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12275 
12276 MARK_AS_PMAP_TEXT uint64_t
12277 pmap_ro_zone_atomic_op_internal(
12278 	zone_id_t             zid,
12279 	vm_offset_t           va,
12280 	vm_offset_t           offset,
12281 	zro_atomic_op_t       op,
12282 	uint64_t              value)
12283 {
12284 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12285 	vm_size_t value_size = op & 0xf;
12286 
12287 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
12288 	pmap_ro_zone_lock_phy_page(pa, va, value_size);
12289 	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
12290 	pmap_ro_zone_unlock_phy_page(pa, va, value_size);
12291 
12292 	return value;
12293 }
12294 
12295 /**
12296  * bzero for allocations from read only zones, that writes through the
12297  * physical aperture.
12298  *
12299  * @note This is called by the zfree path of all allocations from read
12300  * only zones.
12301  *
12302  * @param zid The ID of the zone the allocation belongs to.
12303  * @param va VA of element to be zeroed.
12304  * @param offset Offset in the element.
12305  * @param size	Size of allocation.
12306  *
12307  */
12308 
/*
 * Dispatch the read-only-zone bzero either through the PPL or directly to
 * the internal implementation, depending on build configuration.
 */
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12322 
12323 MARK_AS_PMAP_TEXT void
12324 pmap_ro_zone_bzero_internal(
12325 	zone_id_t       zid,
12326 	vm_offset_t     va,
12327 	vm_offset_t     offset,
12328 	vm_size_t       size)
12329 {
12330 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12331 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
12332 	pmap_ro_zone_lock_phy_page(pa, va, size);
12333 	bzero((void*)phystokv(pa), size);
12334 	pmap_ro_zone_unlock_phy_page(pa, va, size);
12335 }
12336 
12337 /**
12338  * Removes write access from the Physical Aperture.
12339  *
12340  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12341  * @note Designed to work only with the zone allocator's read-only submap.
12342  *
 * @param va VA of the page to remove write access from.
12344  *
12345  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* PPL builds: lock the page down read-only, PPL keeps write access. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* Non-PPL builds: demote all virtual mappings of the page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12355 
12356 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12357 
/*
 * Count resident and compressed bytes in [start, end), which must be
 * page-aligned and fall within a single twig-level (TTE) region.
 *
 * Returns the resident byte count, or PMAP_RESIDENT_INVALID if the pmap is
 * NULL or has no page table for the range.  Compressed bytes are ADDED into
 * *compressed_bytes_p (the caller owns initialization of that counter).
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Walk the leaf PTEs, classifying each as compressed or resident. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	/* Pin the caller's counter while writing it (no-op on non-PPL builds). */
	if (compressed_bytes_p) {
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12419 
12420 mach_vm_size_t
12421 pmap_query_resident(
12422 	pmap_t                  pmap,
12423 	vm_map_address_t        start,
12424 	vm_map_address_t        end,
12425 	mach_vm_size_t          *compressed_bytes_p)
12426 {
12427 	mach_vm_size_t          total_resident_bytes;
12428 	mach_vm_size_t          compressed_bytes;
12429 	vm_map_address_t        va;
12430 
12431 
12432 	if (pmap == PMAP_NULL) {
12433 		if (compressed_bytes_p) {
12434 			*compressed_bytes_p = 0;
12435 		}
12436 		return 0;
12437 	}
12438 
12439 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12440 
12441 	total_resident_bytes = 0;
12442 	compressed_bytes = 0;
12443 
12444 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12445 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12446 	    VM_KERNEL_ADDRHIDE(end));
12447 
12448 	va = start;
12449 	while (va < end) {
12450 		vm_map_address_t l;
12451 		mach_vm_size_t resident_bytes;
12452 
12453 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12454 
12455 		if (l > end) {
12456 			l = end;
12457 		}
12458 #if XNU_MONITOR
12459 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12460 #else
12461 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12462 #endif
12463 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12464 			break;
12465 		}
12466 
12467 		total_resident_bytes += resident_bytes;
12468 
12469 		va = l;
12470 	}
12471 
12472 	if (compressed_bytes_p) {
12473 		*compressed_bytes_p = compressed_bytes;
12474 	}
12475 
12476 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12477 	    total_resident_bytes);
12478 
12479 	return total_resident_bytes;
12480 }
12481 
12482 #if MACH_ASSERT
12483 static void
12484 pmap_check_ledgers(
12485 	pmap_t pmap)
12486 {
12487 	int     pid;
12488 	char    *procname;
12489 
12490 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12491 		/*
12492 		 * This pmap was not or is no longer fully associated
12493 		 * with a task (e.g. the old pmap after a fork()/exec() or
12494 		 * spawn()).  Its "ledger" still points at a task that is
12495 		 * now using a different (and active) address space, so
12496 		 * we can't check that all the pmap ledgers are balanced here.
12497 		 *
12498 		 * If the "pid" is set, that means that we went through
12499 		 * pmap_set_process() in task_terminate_internal(), so
12500 		 * this task's ledger should not have been re-used and
12501 		 * all the pmap ledgers should be back to 0.
12502 		 */
12503 		return;
12504 	}
12505 
12506 	pid = pmap->pmap_pid;
12507 	procname = pmap->pmap_procname;
12508 
12509 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12510 }
12511 #endif /* MACH_ASSERT */
12512 
/* No page-zero advisory handling is needed on this architecture; no-op. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12517 
12518 /**
12519  * The minimum shared region nesting size is used by the VM to determine when to
12520  * break up large mappings to nested regions. The smallest size that these
12521  * mappings can be broken into is determined by what page table level those
12522  * regions are being nested in at and the size of the page tables.
12523  *
12524  * For instance, if a nested region is nesting at L2 for a process utilizing
12525  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12526  * block entry).
12527  *
12528  * @param pmap The target pmap to determine the block size based on whether it's
12529  *             using 16KB or 4KB page tables.
12530  */
12531 uint64_t
12532 pmap_shared_region_size_min(__unused pmap_t pmap)
12533 {
12534 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12535 
12536 	/**
12537 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12538 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12539 	 * point to shared L3 page tables in the shared region pmap.
12540 	 */
12541 	return pt_attr_twig_size(pt_attr);
12542 }
12543 
12544 boolean_t
12545 pmap_enforces_execute_only(
12546 	pmap_t pmap)
12547 {
12548 	return pmap != kernel_pmap;
12549 }
12550 
/*
 * PPL-side setter for the pmap's "VM map enforces code signing" flag.
 * validate_pmap_mutable() ensures the caller passed a live, mutable pmap
 * before the flag is written.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12559 
/*
 * Set whether the pmap's VM map enforces code signing.  Trampolines into
 * the PPL when XNU_MONITOR is enabled, otherwise calls the internal
 * implementation directly.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12571 
12572 extern int cs_process_enforcement_enable;
12573 bool
12574 pmap_get_vm_map_cs_enforced(
12575 	pmap_t pmap)
12576 {
12577 	if (cs_process_enforcement_enable) {
12578 		return true;
12579 	}
12580 	return pmap->pmap_vm_map_cs_enforced;
12581 }
12582 
/* No-op: JIT entitlement state is not tracked by this pmap configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12589 
/*
 * Mark the pmap as JIT-entitled.  Trampolines into the PPL when
 * XNU_MONITOR is enabled; the internal implementation is a no-op here.
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12600 
/* Report whether the pmap is JIT-entitled; always false in this configuration. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12607 
/* No-op: TPRO state is not tracked by this pmap configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12614 
/*
 * Enable TPRO for the pmap.  Trampolines into the PPL when XNU_MONITOR is
 * enabled; the internal implementation is a no-op here.
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12625 
/* Report whether TPRO is enabled for the pmap; always false in this configuration. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12632 
/* Count of PTE-changed races detected (and retried) by pmap_query_page_info_internal(). */
uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12634 
/**
 * Compute the disposition of a single user mapping, returned through disp_p
 * as a mask of PMAP_QUERY_PAGE_* flags (present, compressed, reusable, etc.).
 *
 * @note Writes to the kernel-supplied disp_p buffer are bracketed by
 *       pmap_pin_kernel_pages()/pmap_unpin_kernel_pages() since this routine
 *       may run inside the PPL.
 *
 * @param pmap The user pmap to query; PMAP_NULL and the kernel pmap are rejected.
 * @param va The user virtual address to look up.
 * @param disp_p Output pointer for the disposition flags (set to 0 on error).
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a NULL/kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be concurrently modified under the shared lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		/* Re-validate the PTE now that the PV head lock is held. */
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Walk the page's PV list looking for the entry covering this PTE. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12713 
/*
 * Query the disposition of a user mapping (see
 * pmap_query_page_info_internal()).  Trampolines into the PPL when
 * XNU_MONITOR is enabled.
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12726 
12727 
12728 
/*
 * Return the number of valid user virtual-address bits for this pmap.
 * With mixed page sizes this is derived from the pmap's TCR T0SZ field;
 * otherwise it comes from the boot-time T0SZ constant.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12739 
/* Return the number of valid kernel virtual-address bits (from boot-time T1SZ). */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12745 
/* Size in bytes of the user virtual address space addressable by this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12751 
12752 
12753 
/* Report whether the caller is executing inside the PPL; unsupported here, so always false. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12760 
/*
 * I/O-filter protected writes are not available on this platform;
 * any call is a fatal error.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12767 
/* Claim a reserved PPL page; unsupported without a PPL, so always NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12774 
/* Return a reserved PPL page; unsupported without a PPL, so a no-op. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12780 
12781 
12782 #if PMAP_CS_PPL_MONITOR
12783 
12784 /* Immutable part of the trust cache runtime */
12785 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12786 
12787 /* Mutable part of the trust cache runtime */
12788 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12789 
12790 /* Lock for the trust cache runtime */
12791 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12792 
12793 MARK_AS_PMAP_TEXT kern_return_t
12794 pmap_check_trust_cache_runtime_for_uuid_internal(
12795 	const uint8_t check_uuid[kUUIDSize])
12796 {
12797 	kern_return_t ret = KERN_DENIED;
12798 
12799 	if (amfi->TrustCache.version < 3) {
12800 		/* AMFI change hasn't landed in the build */
12801 		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12802 		return KERN_NOT_SUPPORTED;
12803 	}
12804 
12805 	/* Lock the runtime as shared */
12806 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12807 
12808 	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12809 		&ppl_trust_cache_rt,
12810 		check_uuid,
12811 		NULL);
12812 
12813 	/* Unlock the runtime */
12814 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12815 
12816 	if (tc_ret.error == kTCReturnSuccess) {
12817 		ret = KERN_SUCCESS;
12818 	} else if (tc_ret.error == kTCReturnNotFound) {
12819 		ret = KERN_NOT_FOUND;
12820 	} else {
12821 		ret = KERN_FAILURE;
12822 		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12823 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12824 	}
12825 
12826 	return ret;
12827 }
12828 
/*
 * Kernel entry point: ask the PPL whether a trust cache with the given
 * UUID has been loaded into its runtime.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12835 
/**
 * Validate and load an image4 trust cache into the PPL trust cache runtime.
 *
 * The caller's payload wrapper and manifest pages are locked down to the
 * monitor for the duration of the load.  On success the payload stays
 * monitor-owned (and PPL-writable, since it embeds the libTrustCache
 * management structure); on failure it is returned to the kernel.  The
 * manifest is always returned to the kernel once evaluation completes.
 *
 * @return KERN_SUCCESS, KERN_ALREADY_IN_SET for a duplicate trust cache,
 *         KERN_RESOURCE_SHORTAGE when the PPL needs a donated page (caller
 *         should donate and retry), or KERN_FAILURE on a library error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12955 
/*
 * Kernel-side wrapper for loading a trust cache into the PPL.  If the PPL
 * reports a page shortage, donate a page to its free list and retry until
 * the call can proceed.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
12985 
/**
 * Query the PPL trust cache runtime for a CDHash.  Expects its arguments to
 * already reside in PPL-accessible storage (pmap_query_trust_cache_internal()
 * makes the necessary copies before calling here).
 *
 * @return KERN_SUCCESS if found, KERN_NOT_FOUND if absent,
 *         KERN_INVALID_ARGUMENT for a bad query type, KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13024 
/**
 * PPL entry point for trust cache queries.  Copies the caller's CDHash into
 * PPL storage (preventing ToCToU on the hash), performs the query through
 * the safe API, then copies the result token back out with the kernel pages
 * pinned for the duration of the write.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13049 
13050 kern_return_t
13051 pmap_query_trust_cache(
13052 	TCQueryType_t query_type,
13053 	const uint8_t cdhash[kTCEntryHashSize],
13054 	TrustCacheQueryToken_t *query_token)
13055 {
13056 	kern_return_t ret = KERN_NOT_FOUND;
13057 
13058 	ret = pmap_query_trust_cache_ppl(
13059 		query_type,
13060 		cdhash,
13061 		query_token);
13062 
13063 	return ret;
13064 }
13065 
/* Whether the developer-mode state has been explicitly set at least once. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
/* The current developer-mode state; only meaningful once the flag above is set. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13068 
/*
 * PPL-side toggle for the developer-mode state.  Enforces the legal
 * transition matrix documented below; in particular, once the state has
 * been explicitly set to false it can never be flipped back to true.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13097 
/* Kernel entry point: toggle developer mode through the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13104 
13105 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13106 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13107 
13108 #pragma mark Image4 - New
13109 
/*
 * Pairs a resolved image4 CS trap selector with its AppleImage4 handler so
 * the per-selector wrappers below can dispatch without re-resolving it.
 */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;               /* the trap being serviced */
	image4_cs_trap_handler_t handler;        /* resolved AppleImage4 handler */
} pmap_image4_dispatch_t;
13114 
/*
 * Monitor trap wrapper for IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE: snapshot
 * the caller's argument vector into PPL storage, then dispatch to
 * AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13134 
13135 
13136 
/*
 * Monitor trap wrapper for IMAGE4_CS_TRAP_NONCE_SET: snapshot the caller's
 * argument vector into PPL storage, then dispatch to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13157 
/*
 * Monitor trap wrapper for IMAGE4_CS_TRAP_NONCE_ROLL: snapshot the caller's
 * argument vector into PPL storage, then dispatch to AppleImage4.  No
 * regions are locked down here, so the vector is presumably self-contained
 * (no out-of-line buffers) — confirm against the AppleImage4 interface.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13174 
/**
 * Monitor trap wrapper for IMAGE4_CS_TRAP_IMAGE_ACTIVATE.  Snapshots the
 * argument vector, locks the payload and manifest regions down to the
 * monitor, and dispatches to AppleImage4.  On failure the payload is
 * returned to the kernel; the manifest is always returned once evaluation
 * completes.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13230 
13231 MARK_AS_PMAP_TEXT static errno_t
13232 _pmap_image4_monitor_trap_passthrough(
13233 	__unused const pmap_image4_dispatch_t *dispatch,
13234 	__unused const void *input_data,
13235 	__unused size_t input_size)
13236 {
13237 #if DEVELOPMENT || DEBUG || KASAN
13238 	return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13239 #else
13240 	pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13241 	return ENOSYS;
13242 #endif
13243 }
13244 
/**
 * PPL-side dispatcher for AppleImage4 code-signing monitor traps.  Resolves
 * the selector to a handler, validates the argument-vector size, reserves a
 * spare page for CoreCrypto, and routes the request to the appropriate
 * per-selector wrapper.
 *
 * @param selector The image4 CS trap selector being invoked.
 * @param input_data Pointer to the selector's argument vector.
 * @param input_size Size of the argument vector; must match the selector's
 *                   expected vector size exactly.
 *
 * @return 0 on success or a POSIX errno (ENOMEM means the caller should
 *         donate a page to the PPL and retry).
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13310 
/*
 * Kernel entry point for image4 monitor traps.  While the PPL reports a
 * page shortage (ENOMEM), donate a page to its free list and retry.
 */
errno_t
pmap_image4_monitor_trap(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	errno_t err = EPERM;

	err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	while (err == ENOMEM) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Call the monitor dispatch again */
		err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	}

	return err;
}
13330 
13331 #endif /* PMAP_CS_PPL_MONITOR */
13332 
13333 #if PMAP_CS_INCLUDE_CODE_SIGNING
13334 
/*
 * Total ordering for the provisioning-profile red-black tree.  Profiles are
 * keyed by their address, so a plain pointer comparison suffices.
 *
 * Returns -1, 0, or 1 as profile0 sorts before, equal to, or after profile1.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13347 
13348 /* Red-black tree for managing provisioning profiles */
13349 MARK_AS_PMAP_DATA static
13350 RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;
13351 
13352 RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13353 RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13354 
13355 /* Lock for the profile red-black tree */
13356 MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13357 
/*
 * One-time setup of the PPL's provisioning-profile bookkeeping: the
 * red-black tree of registered profiles and its RW lock (marked
 * non-sleepable since it is taken inside the PPL).
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13370 
/*
 * Determine whether a provisioning profile is a TestFlight profile by
 * querying its entitlements for "beta-reports-active" == true.
 */
static bool
pmap_is_testflight_profile(
	pmap_cs_profile_t *profile_obj)
{
	const char *entitlement_name = "beta-reports-active";
	const size_t entitlement_length = strlen(entitlement_name);
	CEQueryOperation_t query[2] = {0};

	/* If the profile provisions no entitlements, then it isn't a test flight one */
	if (profile_obj->entitlements_ctx == NULL) {
		return false;
	}

	/* Build our CoreEntitlements query: select the key, then match a true value */
	query[0].opcode = kCEOpSelectKey;
	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
	query[0].parameters.stringParameter.length = entitlement_length;
	query[1] = CEMatchBool(true);

	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
		profile_obj->entitlements_ctx,
		query, 2);

	if (ce_err == amfi->CoreEntitlements.kNoError) {
		return true;
	}

	return false;
}
13400 
/*
 * Classify a provisioning profile as a development profile.  Universal
 * (UPP, "ProvisionsAllDevices") profiles and TestFlight profiles are not
 * considered development profiles; everything else is.
 */
static bool
pmap_is_development_profile(
	pmap_cs_profile_t *profile_obj)
{
	/* Check for UPP */
	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("ProvisionsAllDevices"));
	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
			return false;
		}
	}

	/* Check for TestFlight profile */
	if (pmap_is_testflight_profile(profile_obj) == true) {
		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
		return false;
	}

	pmap_cs_log_info("%p: development profile", profile_obj);
	return true;
}
13425 
/**
 * Locate, validate, and cache the "Entitlements" dictionary of a registered
 * profile into the profile object's entitlements context so later queries
 * (e.g. TestFlight detection) can run against it.
 *
 * @return KERN_SUCCESS on success, KERN_NOT_FOUND when the profile provisions
 *         no entitlements (the context is cleared), or KERN_ABORTED when
 *         CoreEntitlements validation or context acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13476 
/*
 * Validate and register a provisioning profile with the PPL.
 *
 * The caller-provided payload (a pmap_profile_payload_t header followed by the
 * raw profile blob) is locked down into PPL-owned memory, validated through
 * CoreTrust, wrapped in a CoreEntitlements DERVM context, and finally inserted
 * into the global red-black tree of registered profiles.
 *
 * @param payload_addr address of the pmap_profile_payload_t allocation.
 * @param payload_size size of that allocation in bytes.
 *
 * @return KERN_SUCCESS on success, KERN_RESOURCE_SHORTAGE when no PPL page
 *         could be reserved (caller is expected to donate a page and retry),
 *         or another error from pmap_reserve_ppl_page(). Validation failures
 *         after lockdown panic rather than return.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* CoreTrust returns a pointer into the blob for the validated content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	/* No users yet; released stores pair with the acquire load in unregister */
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13580 
13581 kern_return_t
13582 pmap_register_provisioning_profile(
13583 	const vm_address_t payload_addr,
13584 	const vm_size_t payload_size)
13585 {
13586 	kern_return_t ret = KERN_DENIED;
13587 
13588 	ret = pmap_register_provisioning_profile_ppl(
13589 		payload_addr,
13590 		payload_size);
13591 
13592 	while (ret == KERN_RESOURCE_SHORTAGE) {
13593 		/* Allocate a page from the free list */
13594 		pmap_alloc_page_for_ppl(0);
13595 
13596 		/* Attempt the call again */
13597 		ret = pmap_register_provisioning_profile_ppl(
13598 			payload_addr,
13599 			payload_size);
13600 	}
13601 
13602 	return ret;
13603 }
13604 
/*
 * Unregister a previously-registered provisioning profile from the PPL.
 *
 * Removes the profile from the red-black tree (panicking if it was never
 * registered), but only when its reference count is zero; otherwise the
 * profile is still associated with at least one code signature.
 * On success, the original payload pages are returned to the kernel.
 *
 * @param profile_obj the registered profile to remove.
 *
 * @return KERN_SUCCESS on removal, KERN_FAILURE if the profile still has
 *         outstanding references.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* Acquire pairs with the release decrement in disassociate */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The backing pages are no longer PPL-owned; drop the stale pointer */
		profile_obj = NULL;
	}
	return ret;
}
13652 
13653 kern_return_t
13654 pmap_unregister_provisioning_profile(
13655 	pmap_cs_profile_t *profile_obj)
13656 {
13657 	return pmap_unregister_provisioning_profile_ppl(profile_obj);
13658 }
13659 
/*
 * Associate a registered provisioning profile with a code signature.
 *
 * Association is only allowed while the signature is still untrusted and has
 * no profile attached. The profile must be present in the registration tree
 * and marked validated; its reference count is bumped under the shared tree
 * lock so a concurrent unregistration cannot race the association.
 *
 * @param cd_entry    code directory to attach the profile to (locked here).
 * @param profile_obj the registered profile to attach.
 *
 * @return KERN_SUCCESS on association, KERN_DENIED when the signature is
 *         already trusted or already has a profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		/* os_atomic_add returns the new value; 0 after +1 means it wrapped */
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13708 
13709 kern_return_t
13710 pmap_associate_provisioning_profile(
13711 	pmap_cs_code_directory_t *cd_entry,
13712 	pmap_cs_profile_t *profile_obj)
13713 {
13714 	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13715 }
13716 
/*
 * Break the association between a code signature and its provisioning
 * profile, dropping the profile's reference count.
 *
 * The pointer is cleared under the code directory lock; the reference count
 * is released after the lock is dropped, since the profile object itself is
 * kept alive by the registration tree.
 *
 * @param cd_entry the code directory to detach from.
 *
 * @return KERN_SUCCESS on disassociation, KERN_NOT_FOUND if no profile was
 *         attached.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			/* os_atomic_sub returns the new value; wrap means it was already 0 */
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13752 
13753 kern_return_t
13754 pmap_disassociate_provisioning_profile(
13755 	pmap_cs_code_directory_t *cd_entry)
13756 {
13757 	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13758 }
13759 
13760 kern_return_t
13761 pmap_associate_kernel_entitlements_internal(
13762 	pmap_cs_code_directory_t *cd_entry,
13763 	const void *kernel_entitlements)
13764 {
13765 	kern_return_t ret = KERN_DENIED;
13766 
13767 	if (kernel_entitlements == NULL) {
13768 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13769 	}
13770 
13771 	/* Acquire the lock on the code directory */
13772 	pmap_cs_lock_code_directory(cd_entry);
13773 
13774 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13775 		ret = KERN_DENIED;
13776 		goto out;
13777 	} else if (cd_entry->kernel_entitlements != NULL) {
13778 		ret = KERN_DENIED;
13779 		goto out;
13780 	}
13781 	cd_entry->kernel_entitlements = kernel_entitlements;
13782 
13783 	/* Association was a success */
13784 	ret = KERN_SUCCESS;
13785 
13786 out:
13787 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13788 	return ret;
13789 }
13790 
13791 kern_return_t
13792 pmap_associate_kernel_entitlements(
13793 	pmap_cs_code_directory_t *cd_entry,
13794 	const void *kernel_entitlements)
13795 {
13796 	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13797 }
13798 
/*
 * Resolve the kernel entitlements object attached to a pmap's main region.
 *
 * Takes the pmap shared lock preemptibly (returning KERN_ABORTED when the
 * lock cannot be taken, so the kernel-side caller can retry), looks up the
 * code signature of the pmap's main region, and, if present, writes out its
 * kernel entitlements pointer through the caller-supplied slot.
 *
 * @param pmap                the task pmap to query (must not be kernel_pmap).
 * @param kernel_entitlements optional out-pointer for the entitlements object.
 *
 * @return KERN_SUCCESS when resolved, KERN_NOT_FOUND when there is no code
 *         signature or no kernel entitlements, KERN_ABORTED when the pmap
 *         lock could not be acquired preemptibly.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object (taken shared by the lookup above) */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13861 
13862 kern_return_t
13863 pmap_resolve_kernel_entitlements(
13864 	pmap_t pmap,
13865 	const void **kernel_entitlements)
13866 {
13867 	kern_return_t ret = KERN_DENIED;
13868 
13869 	do {
13870 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13871 	} while (ret == KERN_ABORTED);
13872 
13873 	return ret;
13874 }
13875 
/*
 * Build a CoreEntitlements acceleration index for a code signature's
 * entitlements context.
 *
 * The index buffer is placed, in order of preference, in the unused tail of
 * the locked-down code signature region, in a PPL blob-allocator bucket, or
 * in a freshly-allocated PPL page. The buffer is handed to CoreEntitlements
 * through the global pmap_cs_acceleration_buf under its simple lock.
 *
 * @param cd_entry the code directory whose entitlements should be indexed.
 *
 * @return KERN_SUCCESS when accelerated (or when there is nothing to do),
 *         KERN_DENIED for non-reconstituted signatures, KERN_ABORTED when
 *         the index would exceed a page, or an allocator error (including
 *         KERN_RESOURCE_SHORTAGE, which the kernel-side wrapper retries).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to accelerate, or already accelerated -- both are success */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Pointer-align the buffer in the tail of the signature region */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		/* Buffer came from an allocator and must be freed when released */
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14022 
14023 kern_return_t
14024 pmap_accelerate_entitlements(
14025 	pmap_cs_code_directory_t *cd_entry)
14026 {
14027 	kern_return_t ret = KERN_DENIED;
14028 
14029 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
14030 	while (ret == KERN_RESOURCE_SHORTAGE) {
14031 		/* Allocate a page for the PPL */
14032 		pmap_alloc_page_for_ppl(0);
14033 
14034 		/* Try again */
14035 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
14036 	}
14037 
14038 	return ret;
14039 }
14040 
14041 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14042 
/*
 * Query the loadable trust caches for a CDHash.
 *
 * @param cdhash the CDHash to look up (CS_CDHASH_LEN bytes).
 *
 * @return true if the hash is present in any loaded trust cache.
 */
MARK_AS_PMAP_TEXT bool
pmap_lookup_in_loaded_trust_caches_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	kern_return_t kr = KERN_NOT_FOUND;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	/* Copy the hash into PPL-local storage before the query */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeLoadable,
		cdhash_safe,
		NULL);
#else
	kr = query_trust_cache(
		kTCQueryTypeLoadable,
		cdhash,
		NULL);
#endif

	if (kr == KERN_SUCCESS) {
		return true;
	}
	return false;
}
14074 
/*
 * Kernel-side entry point for loadable trust cache lookups. Routes through
 * the PPL when the monitor is built in, otherwise calls directly.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14085 
/*
 * Query the static (platform) trust cache for a CDHash.
 *
 * @param cdhash the CDHash to look up (CS_CDHASH_LEN bytes).
 *
 * @return 0 when not found; otherwise a packed result word combining
 *         TC_LOOKUP_FOUND, the entry's hash type and its flags.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		/*
		 * NOTE(review): flags are deliberately truncated to 8 bits before
		 * being packed -- confirm no flag above bit 7 is ever needed here.
		 */
		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14126 
/*
 * Kernel-side entry point for static trust cache lookups. Routes through
 * the PPL when the monitor is built in, otherwise calls directly.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14136 
14137 #if PMAP_CS_INCLUDE_CODE_SIGNING
14138 
/* Protects pmap_compilation_service_cdhash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash allowed for the compilation service; all-zero until set via the PPL */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14141 
14142 MARK_AS_PMAP_TEXT void
14143 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14144 {
14145 
14146 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14147 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14148 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14149 
14150 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14151 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14152 }
14153 
14154 MARK_AS_PMAP_TEXT bool
14155 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14156 {
14157 	bool match = false;
14158 
14159 	/* Lockdown mode disallows compilation service */
14160 	if (ppl_lockdown_mode_enabled == true) {
14161 		return false;
14162 	}
14163 
14164 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14165 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14166 		match = true;
14167 	}
14168 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14169 
14170 	if (match) {
14171 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14172 	}
14173 
14174 	return match;
14175 }
14176 
/*
 * Kernel-side entry point for registering the compilation service CDHash.
 * Routes through the PPL when the monitor is built in.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14186 
/*
 * Kernel-side entry point for matching against the compilation service
 * CDHash. Routes through the PPL when the monitor is built in.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14196 
14197 /*
14198  * As part of supporting local signing on the device, we need the PMAP layer
14199  * to store the local signing key so that PMAP_CS can validate with it. We
14200  * store it at the PMAP layer such that it is accessible to both AMFI and
14201  * PMAP_CS should they need it.
14202  */
/* Write-once flag: true after the local signing public key has been set */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* The local signing public key; contents valid only once the flag is true */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14205 
/*
 * Store the device's local signing public key. The key is write-once: a
 * second attempt to set it panics.
 *
 * @param public_key the key bytes to store (PMAP_CS_LOCAL_SIGNING_KEY_SIZE).
 */
MARK_AS_PMAP_TEXT void
pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
	bool key_set = false;

	/*
	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
	 * a successful exchange means that the local signing public key has _not_ been
	 * set. In case the key has been set, we panic as we would never expect the
	 * kernel to attempt to set the key more than once.
	 */
	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);

	if (key_set) {
		panic("attempted to set the local signing public key multiple times");
	}

	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
	pmap_cs_log_info("set local signing public key");
}
14226 
/*
 * Kernel-side entry point for setting the local signing public key.
 * Routes through the PPL when the monitor is built in.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14236 
14237 uint8_t*
14238 pmap_get_local_signing_public_key(void)
14239 {
14240 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14241 
14242 	if (key_set) {
14243 		return pmap_local_signing_public_key;
14244 	}
14245 
14246 	return NULL;
14247 }
14248 
14249 /*
14250  * Locally signed applications need to be explicitly authorized by an entitled application
14251  * before we allow them to run.
14252  */
/* CDHash currently unrestricted for local signing; all-zero when none */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash above */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14255 
/*
 * Record a CDHash as explicitly authorized for local signing. Only one hash
 * is unrestricted at a time -- a later call replaces the previous one.
 *
 * @param cdhash the CDHash to authorize (CS_CDHASH_LEN bytes).
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
14268 
/*
 * Kernel-side entry point for unrestricting a locally-signed CDHash.
 * Routes through the PPL when the monitor is built in.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14279 
14280 #if PMAP_CS
/* Clear the currently-unrestricted local signing CDHash (back to all-zero) */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14288 
14289 MARK_AS_PMAP_TEXT static bool
14290 pmap_local_signing_restricted(
14291 	const uint8_t cdhash[CS_CDHASH_LEN])
14292 {
14293 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14294 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14295 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14296 
14297 	return ret != 0;
14298 }
14299 
14300 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14301 #endif
14302 
/*
 * Suspend or resume footprint accounting for the current thread against the
 * given map (DEVELOPMENT/DEBUG only; no-op on release kernels).
 *
 * NOTE(review): footprint_was_suspended is set on suspend but never cleared
 * on resume here -- presumably a sticky "was ever suspended" marker for the
 * VM; confirm against its readers.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14320 
/*
 * Kernel-side entry point for footprint accounting suspension.
 * Routes through the PPL when the monitor is built in.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14332 
/* No-op PPL call: only validates the pmap, useful for testing the PPL path */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14338 
/*
 * Kernel-side entry point for the no-op PPL call.
 * Routes through the PPL when the monitor is built in.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14348 
14349 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14350 
/*
 * Header emitted before each copied page table in the dump buffer produced
 * by pmap_dump_page_tables_recurse().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the page table */
	uint64_t num_entries; /* number of entries copied from the table */
	uint64_t start_va;    /* first VA covered by the table */
	uint64_t end_va;      /* first VA past the table's coverage */
};
14357 
/*
 * Recursively copy a pmap's page tables into a flat buffer for the kernel
 * debugger.
 *
 * For every table whose level is selected in level_mask, a
 * page_table_dump_header followed by the raw table contents is appended at
 * *bytes_copied; table entries are then walked and any next-level tables are
 * recursed into. Block and invalid entries are skipped.
 *
 * @param pmap         pmap whose tables are being dumped.
 * @param ttp          kernel-virtual pointer to the current table.
 * @param cur_level    translation level of ttp.
 * @param level_mask   bitmask of levels whose tables should be emitted.
 * @param start_va     first VA mapped by ttp.
 * @param buf_start    start of the output buffer.
 * @param buf_end      end of the output buffer.
 * @param bytes_copied in/out running count of bytes written to the buffer.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer
 *         cannot hold the next table plus its header.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be smaller than a full page and aligned differently */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit a header followed by the raw table contents */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table to recurse into */
			continue;
		} else {
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14432 
14433 kern_return_t
14434 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
14435 {
14436 	if (not_in_kdp) {
14437 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
14438 	}
14439 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
14440 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
14441 }
14442 
14443 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14444 
/**
 * Stub for configurations where page-table dumping is not compiled in
 * (non-arm64, or builds without DEVELOPMENT/DEBUG).
 *
 * @return KERN_NOT_SUPPORTED unconditionally.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14451 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14452 
14453 
14454 #ifdef CONFIG_XNUPOST
14455 #ifdef __arm64__
14456 static volatile bool pmap_test_took_fault = false;
14457 
14458 static bool
14459 pmap_test_fault_handler(arm_saved_state_t * state)
14460 {
14461 	bool retval                 = false;
14462 	uint32_t esr                = get_saved_state_esr(state);
14463 	esr_exception_class_t class = ESR_EC(esr);
14464 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14465 
14466 	if ((class == ESR_EC_DABORT_EL1) &&
14467 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14468 		pmap_test_took_fault = true;
14469 		/* return to the instruction immediately after the call to NX page */
14470 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14471 		retval = true;
14472 	}
14473 
14474 	return retval;
14475 }
14476 
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write to va and report whether the observed fault
 * behavior matched expectations.
 *
 * @param pmap         If non-NULL, switch to this pmap (with PAN disabled)
 *                     for the duration of the access; if NULL, access
 *                     through the current address space.
 * @param va           The virtual address to access.
 * @param should_fault Whether the access is expected to fault.
 * @param is_write     true to store to va, false to load from it.
 *
 * @return true if the access faulted exactly when expected.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	/* Cleared here; set by pmap_test_fault_handler() if the access faults. */
	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14529 
14530 static bool
14531 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14532 {
14533 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14534 
14535 	if (!retval) {
14536 		T_FAIL("%s: %s, "
14537 		    "pmap=%p, va=%p, should_fault=%u",
14538 		    __func__, should_fault ? "did not fault" : "faulted",
14539 		    pmap, (void*)va, (unsigned)should_fault);
14540 	}
14541 
14542 	return retval;
14543 }
14544 
14545 static bool
14546 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14547 {
14548 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14549 
14550 	if (!retval) {
14551 		T_FAIL("%s: %s, "
14552 		    "pmap=%p, va=%p, should_fault=%u",
14553 		    __func__, should_fault ? "did not fault" : "faulted",
14554 		    pmap, (void*)va, (unsigned)should_fault);
14555 	}
14556 
14557 	return retval;
14558 }
14559 
14560 static bool
14561 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14562 {
14563 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14564 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14565 
14566 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14567 
14568 	if (!retval) {
14569 		T_FAIL("%s: bits=%u, "
14570 		    "pa=%p, should_be_set=%u",
14571 		    __func__, bits,
14572 		    (void*)pa, should_be_set);
14573 	}
14574 
14575 	return retval;
14576 }
14577 
14578 static __attribute__((noinline)) bool
14579 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14580 {
14581 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14582 	return retval;
14583 }
14584 
14585 static int
14586 pmap_test_test_config(unsigned int flags)
14587 {
14588 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
14589 	unsigned int map_count = 0;
14590 	unsigned long page_ratio = 0;
14591 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
14592 
14593 	if (!pmap) {
14594 		panic("Failed to allocate pmap");
14595 	}
14596 
14597 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14598 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14599 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14600 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14601 
14602 	if (pmap_page_size <= native_page_size) {
14603 		page_ratio = native_page_size / pmap_page_size;
14604 	} else {
14605 		/*
14606 		 * We claim to support a page_ratio of less than 1, which is
14607 		 * not currently supported by the pmap layer; panic.
14608 		 */
14609 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14610 		    "flags=%u",
14611 		    __func__, native_page_size, pmap_page_size,
14612 		    flags);
14613 	}
14614 
14615 	if (PAGE_RATIO > 1) {
14616 		/*
14617 		 * The kernel is deliberately pretending to have 16KB pages.
14618 		 * The pmap layer has code that supports this, so pretend the
14619 		 * page size is larger than it is.
14620 		 */
14621 		pmap_page_size = PAGE_SIZE;
14622 		native_page_size = PAGE_SIZE;
14623 	}
14624 
14625 	/*
14626 	 * Get two pages from the VM; one to be mapped wired, and one to be
14627 	 * mapped nonwired.
14628 	 */
14629 	vm_page_t unwired_vm_page = vm_page_grab();
14630 	vm_page_t wired_vm_page = vm_page_grab();
14631 
14632 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14633 		panic("Failed to grab VM pages");
14634 	}
14635 
14636 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14637 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14638 
14639 	pmap_paddr_t pa = ptoa(pn);
14640 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14641 
14642 	/*
14643 	 * We'll start mappings at the second twig TT.  This keeps us from only
14644 	 * using the first entry in each TT, which would trivially be address
14645 	 * 0; one of the things we will need to test is retrieving the VA for
14646 	 * a given PTE.
14647 	 */
14648 	vm_map_address_t va_base = pmap_twig_size;
14649 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14650 
14651 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14652 		/*
14653 		 * Not exactly a functional failure, but this test relies on
14654 		 * there being a spare PTE slot we can use to pin the TT.
14655 		 */
14656 		panic("Cannot pin translation table");
14657 	}
14658 
14659 	/*
14660 	 * Create the wired mapping; this will prevent the pmap layer from
14661 	 * reclaiming our test TTs, which would interfere with this test
14662 	 * ("interfere" -> "make it panic").
14663 	 */
14664 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14665 
14666 #if XNU_MONITOR
14667 	/*
14668 	 * If the PPL is enabled, make sure that the kernel cannot write
14669 	 * to PPL memory.
14670 	 */
14671 	if (!pmap_ppl_disable) {
14672 		T_LOG("Validate that kernel cannot write to PPL memory.");
14673 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14674 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14675 	}
14676 #endif
14677 
14678 	/*
14679 	 * Create read-only mappings of the nonwired page; if the pmap does
14680 	 * not use the same page size as the kernel, create multiple mappings
14681 	 * so that the kernel page is fully mapped.
14682 	 */
14683 	for (map_count = 0; map_count < page_ratio; map_count++) {
14684 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14685 	}
14686 
14687 	/* Validate that all the PTEs have the expected PA and VA. */
14688 	for (map_count = 0; map_count < page_ratio; map_count++) {
14689 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14690 
14691 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14692 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14693 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14694 		}
14695 
14696 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14697 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14698 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14699 		}
14700 	}
14701 
14702 	T_LOG("Validate that reads to our mapping do not fault.");
14703 	pmap_test_read(pmap, va_base, false);
14704 
14705 	T_LOG("Validate that writes to our mapping fault.");
14706 	pmap_test_write(pmap, va_base, true);
14707 
14708 	T_LOG("Make the first mapping writable.");
14709 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14710 
14711 	T_LOG("Validate that writes to our mapping do not fault.");
14712 	pmap_test_write(pmap, va_base, false);
14713 
14714 
14715 	T_LOG("Make the first mapping execute-only");
14716 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14717 
14718 
14719 	T_LOG("Validate that reads to our mapping do not fault.");
14720 	pmap_test_read(pmap, va_base, false);
14721 
14722 	T_LOG("Validate that writes to our mapping fault.");
14723 	pmap_test_write(pmap, va_base, true);
14724 
14725 
14726 	/*
14727 	 * For page ratios of greater than 1: validate that writes to the other
14728 	 * mappings still fault.  Remove the mappings afterwards (we're done
14729 	 * with page ratio testing).
14730 	 */
14731 	for (map_count = 1; map_count < page_ratio; map_count++) {
14732 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14733 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14734 	}
14735 
14736 	T_LOG("Mark the page unreferenced and unmodified.");
14737 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14738 	pmap_test_check_refmod(pa, 0);
14739 
14740 	/*
14741 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14742 	 * different protection/fault_type settings, and confirm that the
14743 	 * ref/mod state matches our expectations at each step.
14744 	 */
14745 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14746 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14747 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14748 
14749 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14750 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14751 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14752 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14753 
14754 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14755 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14756 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14757 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14758 
14759 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14760 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14761 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14762 
14763 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14764 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14765 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14766 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14767 
14768 	/*
14769 	 * Shared memory testing; we'll have two mappings; one read-only,
14770 	 * one read-write.
14771 	 */
14772 	vm_map_address_t rw_base = va_base;
14773 	vm_map_address_t ro_base = va_base + pmap_page_size;
14774 
14775 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14776 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14777 
14778 	/*
14779 	 * Test that we take faults as expected for unreferenced/unmodified
14780 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14781 	 * mapping permissions change as expected.
14782 	 */
14783 	T_LOG("!ref/!mod: expect no access");
14784 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14785 	pmap_test_read_write(pmap, ro_base, false, false);
14786 	pmap_test_read_write(pmap, rw_base, false, false);
14787 
14788 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14789 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14790 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14791 	pmap_test_read_write(pmap, ro_base, true, false);
14792 	pmap_test_read_write(pmap, rw_base, true, false);
14793 
14794 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14795 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14796 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14797 	pmap_test_read_write(pmap, ro_base, true, false);
14798 	pmap_test_read_write(pmap, rw_base, true, true);
14799 
14800 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14801 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14802 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14803 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14804 	pmap_test_read_write(pmap, ro_base, true, false);
14805 	pmap_test_read_write(pmap, rw_base, true, true);
14806 
14807 	T_LOG("RW protect both mappings; should not change protections.");
14808 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14809 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14810 	pmap_test_read_write(pmap, ro_base, true, false);
14811 	pmap_test_read_write(pmap, rw_base, true, true);
14812 
14813 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14814 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14815 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14816 	pmap_test_read_write(pmap, ro_base, true, false);
14817 	pmap_test_read_write(pmap, rw_base, true, false);
14818 
14819 	T_LOG("RW protect the page; mappings should not change protections.");
14820 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14821 	pmap_page_protect(pn, VM_PROT_ALL);
14822 	pmap_test_read_write(pmap, ro_base, true, false);
14823 	pmap_test_read_write(pmap, rw_base, true, true);
14824 
14825 	T_LOG("Read protect the page; RW mapping should become RO.");
14826 	pmap_page_protect(pn, VM_PROT_READ);
14827 	pmap_test_read_write(pmap, ro_base, true, false);
14828 	pmap_test_read_write(pmap, rw_base, true, false);
14829 
14830 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14831 	pmap_disconnect(pn);
14832 	if (!pmap_verify_free(pn)) {
14833 		T_FAIL("Page still has mappings");
14834 	}
14835 
14836 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14837 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14838 	pmap_destroy(pmap);
14839 
14840 	T_LOG("Release the pages back to the VM.");
14841 	vm_page_lock_queues();
14842 	vm_page_free(unwired_vm_page);
14843 	vm_page_free(wired_vm_page);
14844 	vm_page_unlock_queues();
14845 
14846 	T_LOG("Testing successful!");
14847 	return 0;
14848 }
14849 #endif /* __arm64__ */
14850 
14851 kern_return_t
14852 pmap_test(void)
14853 {
14854 	T_LOG("Starting pmap_tests");
14855 #ifdef __arm64__
14856 	int flags = 0;
14857 	flags |= PMAP_CREATE_64BIT;
14858 
14859 #if __ARM_MIXED_PAGE_SIZE__
14860 	T_LOG("Testing VM_PAGE_SIZE_4KB");
14861 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14862 	T_LOG("Testing VM_PAGE_SIZE_16KB");
14863 	pmap_test_test_config(flags);
14864 #else /* __ARM_MIXED_PAGE_SIZE__ */
14865 	pmap_test_test_config(flags);
14866 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14867 
14868 #endif /* __arm64__ */
14869 	T_PASS("completed pmap_test successfully");
14870 	return KERN_SUCCESS;
14871 }
14872 #endif /* CONFIG_XNUPOST */
14873 
14874 /*
14875  * The following function should never make it to RELEASE code, since
14876  * it provides a way to get the PPL to modify text pages.
14877  */
14878 #if DEVELOPMENT || DEBUG
14879 
14880 #define ARM_UNDEFINED_INSN 0xe7f000f0
14881 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14882 
14883 /**
14884  * Forcibly overwrite executable text with an illegal instruction.
14885  *
14886  * @note Only used for xnu unit testing.
14887  *
14888  * @param pa The physical address to corrupt.
14889  *
14890  * @return KERN_SUCCESS on success.
14891  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* With the PPL enabled, dispatch into the PPL to perform the write. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14901 
/**
 * Implementation of pmap_test_text_corruption(): stores an architecturally
 * undefined instruction at the supplied physical address via the physical
 * aperture.  Runs inside the PPL when XNU_MONITOR is enabled.
 *
 * @param pa Physical address of the instruction to overwrite; the low bit
 *           selects a Thumb (16-bit) vs. ARM (32-bit) undefined encoding.
 *
 * @return KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the PV head lock so the page's mapping state can't change under us. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages have a read-only physical-aperture mapping
	 * (restored to AP_RONA below); temporarily grant write access so
	 * the store can land.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the read-only aperture permission. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Discard stale I-cache contents so the new instruction is fetched. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14944 
14945 #endif /* DEVELOPMENT || DEBUG */
14946